2 2 2 1 1 1 1 1 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 // SPDX-License-Identifier: GPL-2.0-or-later /* * TechnoTrend USB IR Receiver * * Copyright (C) 2012 Sean Young <sean@mess.org> */ #include <linux/module.h> #include <linux/usb.h> #include <linux/usb/input.h> #include <linux/slab.h> #include <linux/leds.h> #include <media/rc-core.h> #define DRIVER_NAME "ttusbir" #define DRIVER_DESC "TechnoTrend USB IR Receiver" /* * The Windows driver uses 8 URBS, the original lirc drivers has a * configurable amount (2 default, 4 max). This device generates about 125 * messages per second (!), whether IR is idle or not. */ #define NUM_URBS 4 #define US_PER_BYTE 62 #define US_PER_BIT (US_PER_BYTE / 8) struct ttusbir { struct rc_dev *rc; struct device *dev; struct usb_device *udev; struct urb *urb[NUM_URBS]; struct led_classdev led; struct urb *bulk_urb; uint8_t bulk_buffer[5]; int bulk_out_endp, iso_in_endp; bool led_on, is_led_on; atomic_t led_complete; char phys[64]; }; static enum led_brightness ttusbir_brightness_get(struct led_classdev *led_dev) { struct ttusbir *tt = container_of(led_dev, struct ttusbir, led); return tt->led_on ? LED_FULL : LED_OFF; } static void ttusbir_set_led(struct ttusbir *tt) { int ret; smp_mb(); if (tt->led_on != tt->is_led_on && tt->udev && atomic_add_unless(&tt->led_complete, 1, 1)) { tt->bulk_buffer[4] = tt->is_led_on = tt->led_on; ret = usb_submit_urb(tt->bulk_urb, GFP_ATOMIC); if (ret) { dev_warn(tt->dev, "failed to submit bulk urb: %d\n", ret); atomic_dec(&tt->led_complete); } } } static void ttusbir_brightness_set(struct led_classdev *led_dev, enum led_brightness brightness) { struct ttusbir *tt = container_of(led_dev, struct ttusbir, led); tt->led_on = brightness != LED_OFF; ttusbir_set_led(tt); } /* * The urb cannot be reused until the urb completes */ static void ttusbir_bulk_complete(struct urb *urb) { struct ttusbir *tt = urb->context; atomic_dec(&tt->led_complete); switch (urb->status) { case 0: break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: return; case -EPIPE: default: dev_dbg(tt->dev, "Error: urb status = %d\n", urb->status); break; } ttusbir_set_led(tt); } /* * The data is one bit per sample, a set bit signifying silence and samples * being MSB first. Bit 0 can contain garbage so take it to be whatever * bit 1 is, so we don't have unexpected edges. */ static void ttusbir_process_ir_data(struct ttusbir *tt, uint8_t *buf) { struct ir_raw_event rawir = {}; unsigned i, v, b; bool event = false; for (i = 0; i < 128; i++) { v = buf[i] & 0xfe; switch (v) { case 0xfe: rawir.pulse = false; rawir.duration = US_PER_BYTE; if (ir_raw_event_store_with_filter(tt->rc, &rawir)) event = true; break; case 0: rawir.pulse = true; rawir.duration = US_PER_BYTE; if (ir_raw_event_store_with_filter(tt->rc, &rawir)) event = true; break; default: /* one edge per byte */ if (v & 2) { b = ffz(v | 1); rawir.pulse = true; } else { b = ffs(v) - 1; rawir.pulse = false; } rawir.duration = US_PER_BIT * (8 - b); if (ir_raw_event_store_with_filter(tt->rc, &rawir)) event = true; rawir.pulse = !rawir.pulse; rawir.duration = US_PER_BIT * b; if (ir_raw_event_store_with_filter(tt->rc, &rawir)) event = true; break; } } /* don't wakeup when there's nothing to do */ if (event) ir_raw_event_handle(tt->rc); } static void ttusbir_urb_complete(struct urb *urb) { struct ttusbir *tt = urb->context; int rc; switch (urb->status) { case 0: ttusbir_process_ir_data(tt, urb->transfer_buffer); break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: return; case -EPIPE: default: dev_dbg(tt->dev, "Error: urb status = %d\n", urb->status); break; } rc = usb_submit_urb(urb, GFP_ATOMIC); if (rc && rc != -ENODEV) dev_warn(tt->dev, "failed to resubmit urb: %d\n", rc); } static int ttusbir_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct ttusbir *tt; struct usb_interface_descriptor *idesc; struct usb_endpoint_descriptor *desc; struct rc_dev *rc; int i, j, ret; int altsetting = -1; tt = kzalloc(sizeof(*tt), GFP_KERNEL); rc = rc_allocate_device(RC_DRIVER_IR_RAW); if (!tt || !rc) { ret = -ENOMEM; goto out; } /* find the correct alt setting */ for (i = 0; i < intf->num_altsetting && altsetting == -1; i++) { int max_packet, bulk_out_endp = -1, iso_in_endp = -1; idesc = &intf->altsetting[i].desc; for (j = 0; j < idesc->bNumEndpoints; j++) { desc = &intf->altsetting[i].endpoint[j].desc; max_packet = le16_to_cpu(desc->wMaxPacketSize); if (usb_endpoint_dir_in(desc) && usb_endpoint_xfer_isoc(desc) && max_packet == 0x10) iso_in_endp = j; else if (usb_endpoint_dir_out(desc) && usb_endpoint_xfer_bulk(desc) && max_packet == 0x20) bulk_out_endp = j; if (bulk_out_endp != -1 && iso_in_endp != -1) { tt->bulk_out_endp = bulk_out_endp; tt->iso_in_endp = iso_in_endp; altsetting = i; break; } } } if (altsetting == -1) { dev_err(&intf->dev, "cannot find expected altsetting\n"); ret = -ENODEV; goto out; } tt->dev = &intf->dev; tt->udev = interface_to_usbdev(intf); tt->rc = rc; ret = usb_set_interface(tt->udev, 0, altsetting); if (ret) goto out; for (i = 0; i < NUM_URBS; i++) { struct urb *urb = usb_alloc_urb(8, GFP_KERNEL); void *buffer; if (!urb) { ret = -ENOMEM; goto out; } urb->dev = tt->udev; urb->context = tt; urb->pipe = usb_rcvisocpipe(tt->udev, tt->iso_in_endp); urb->interval = 1; buffer = usb_alloc_coherent(tt->udev, 128, GFP_KERNEL, &urb->transfer_dma); if (!buffer) { usb_free_urb(urb); ret = -ENOMEM; goto out; } urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP | URB_ISO_ASAP; urb->transfer_buffer = buffer; urb->complete = ttusbir_urb_complete; urb->number_of_packets = 8; urb->transfer_buffer_length = 128; for (j = 0; j < 8; j++) { urb->iso_frame_desc[j].offset = j * 16; urb->iso_frame_desc[j].length = 16; } tt->urb[i] = urb; } tt->bulk_urb = usb_alloc_urb(0, GFP_KERNEL); if (!tt->bulk_urb) { ret = -ENOMEM; goto out; } tt->bulk_buffer[0] = 0xaa; tt->bulk_buffer[1] = 0x01; tt->bulk_buffer[2] = 0x05; tt->bulk_buffer[3] = 0x01; usb_fill_bulk_urb(tt->bulk_urb, tt->udev, usb_sndbulkpipe(tt->udev, tt->bulk_out_endp), tt->bulk_buffer, sizeof(tt->bulk_buffer), ttusbir_bulk_complete, tt); tt->led.name = "ttusbir:green:power"; tt->led.default_trigger = "rc-feedback"; tt->led.brightness_set = ttusbir_brightness_set; tt->led.brightness_get = ttusbir_brightness_get; tt->is_led_on = tt->led_on = true; atomic_set(&tt->led_complete, 0); ret = led_classdev_register(&intf->dev, &tt->led); if (ret) goto out; usb_make_path(tt->udev, tt->phys, sizeof(tt->phys)); rc->device_name = DRIVER_DESC; rc->input_phys = tt->phys; usb_to_input_id(tt->udev, &rc->input_id); rc->dev.parent = &intf->dev; rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rc->priv = tt; rc->driver_name = DRIVER_NAME; rc->map_name = RC_MAP_TT_1500; rc->min_timeout = 1; rc->timeout = IR_DEFAULT_TIMEOUT; rc->max_timeout = 10 * IR_DEFAULT_TIMEOUT; /* * The precision is US_PER_BIT, but since every 8th bit can be * overwritten with garbage the accuracy is at best 2 * US_PER_BIT. */ rc->rx_resolution = 2 * US_PER_BIT; ret = rc_register_device(rc); if (ret) { dev_err(&intf->dev, "failed to register rc device %d\n", ret); goto out2; } usb_set_intfdata(intf, tt); for (i = 0; i < NUM_URBS; i++) { ret = usb_submit_urb(tt->urb[i], GFP_KERNEL); if (ret) { dev_err(tt->dev, "failed to submit urb %d\n", ret); goto out3; } } return 0; out3: rc_unregister_device(rc); rc = NULL; out2: led_classdev_unregister(&tt->led); out: if (tt) { for (i = 0; i < NUM_URBS && tt->urb[i]; i++) { struct urb *urb = tt->urb[i]; usb_kill_urb(urb); usb_free_coherent(tt->udev, 128, urb->transfer_buffer, urb->transfer_dma); usb_free_urb(urb); } usb_kill_urb(tt->bulk_urb); usb_free_urb(tt->bulk_urb); kfree(tt); } rc_free_device(rc); return ret; } static void ttusbir_disconnect(struct usb_interface *intf) { struct ttusbir *tt = usb_get_intfdata(intf); struct usb_device *udev = tt->udev; int i; tt->udev = NULL; rc_unregister_device(tt->rc); led_classdev_unregister(&tt->led); for (i = 0; i < NUM_URBS; i++) { usb_kill_urb(tt->urb[i]); usb_free_coherent(udev, 128, tt->urb[i]->transfer_buffer, tt->urb[i]->transfer_dma); usb_free_urb(tt->urb[i]); } usb_kill_urb(tt->bulk_urb); usb_free_urb(tt->bulk_urb); usb_set_intfdata(intf, NULL); kfree(tt); } static int ttusbir_suspend(struct usb_interface *intf, pm_message_t message) { struct ttusbir *tt = usb_get_intfdata(intf); int i; for (i = 0; i < NUM_URBS; i++) usb_kill_urb(tt->urb[i]); led_classdev_suspend(&tt->led); usb_kill_urb(tt->bulk_urb); return 0; } static int ttusbir_resume(struct usb_interface *intf) { struct ttusbir *tt = usb_get_intfdata(intf); int i, rc; tt->is_led_on = true; led_classdev_resume(&tt->led); for (i = 0; i < NUM_URBS; i++) { rc = usb_submit_urb(tt->urb[i], GFP_NOIO); if (rc) { dev_warn(tt->dev, "failed to submit urb: %d\n", rc); break; } } return rc; } static const struct usb_device_id ttusbir_table[] = { { USB_DEVICE(0x0b48, 0x2003) }, { } }; static struct usb_driver ttusbir_driver = { .name = DRIVER_NAME, .id_table = ttusbir_table, .probe = ttusbir_probe, .suspend = ttusbir_suspend, .resume = ttusbir_resume, .reset_resume = ttusbir_resume, .disconnect = ttusbir_disconnect, }; module_usb_driver(ttusbir_driver); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR("Sean Young <sean@mess.org>"); MODULE_LICENSE("GPL"); MODULE_DEVICE_TABLE(usb, ttusbir_table);
55 50 55 481 49 2 55 29 29 29 29 481 144 481 481 481 29 8 481 55 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * linux/include/linux/jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com> * * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved * * Definitions for transaction data structures for the buffer cache * filesystem journaling support. */ #ifndef _LINUX_JBD2_H #define _LINUX_JBD2_H /* Allow this file to be included directly into e2fsprogs */ #ifndef __KERNEL__ #include "jfs_compat.h" #define JBD2_DEBUG #else #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/journal-head.h> #include <linux/stddef.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/bit_spinlock.h> #include <linux/blkdev.h> #include <crypto/hash.h> #endif #define journal_oom_retry 1 /* * Define JBD2_PARANIOD_IOFAIL to cause a kernel BUG() if ext4 finds * certain classes of error which can occur due to failed IOs. Under * normal use we want ext4 to continue after such errors, because * hardware _can_ fail, but for debugging purposes when running tests on * known-good hardware we may want to trap these errors. */ #undef JBD2_PARANOID_IOFAIL /* * The default maximum commit age, in seconds. */ #define JBD2_DEFAULT_MAX_COMMIT_AGE 5 #ifdef CONFIG_JBD2_DEBUG /* * Define JBD2_EXPENSIVE_CHECKING to enable more expensive internal * consistency checks. By default we don't do this unless * CONFIG_JBD2_DEBUG is on. */ #define JBD2_EXPENSIVE_CHECKING void __jbd2_debug(int level, const char *file, const char *func, unsigned int line, const char *fmt, ...); #define jbd2_debug(n, fmt, a...) \ __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) #else #define jbd2_debug(n, fmt, a...) no_printk(fmt, ##a) #endif extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 #ifdef __KERNEL__ /** * typedef handle_t - The handle_t type represents a single atomic update being performed by some process. * * All filesystem modifications made by the process go * through this handle. Recursive operations (such as quota operations) * are gathered into a single update. * * The buffer credits field is used to account for journaled buffers * being modified by the running process. To ensure that there is * enough log space for all outstanding operations, we need to limit the * number of outstanding buffers possible at any time. When the * operation completes, any buffer credits not used are credited back to * the transaction, so that at all times we know how many buffers the * outstanding updates on a transaction might possibly touch. * * This is an opaque datatype. **/ typedef struct jbd2_journal_handle handle_t; /* Atomic operation type */ /** * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem. * * journal_t is linked to from the fs superblock structure. * * We use the journal_t to keep track of all outstanding transaction * activity on the filesystem, and to manage the state of the log * writing process. * * This is an opaque datatype. **/ typedef struct journal_s journal_t; /* Journal control structure */ #endif /* * Internal structures used by the logging mechanism: */ #define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */ /* * On-disk structures */ /* * Descriptor block types: */ #define JBD2_DESCRIPTOR_BLOCK 1 #define JBD2_COMMIT_BLOCK 2 #define JBD2_SUPERBLOCK_V1 3 #define JBD2_SUPERBLOCK_V2 4 #define JBD2_REVOKE_BLOCK 5 /* * Standard header for all descriptor blocks: */ typedef struct journal_header_s { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; } journal_header_t; /* * Checksum types. */ #define JBD2_CRC32_CHKSUM 1 #define JBD2_MD5_CHKSUM 2 #define JBD2_SHA1_CHKSUM 3 #define JBD2_CRC32C_CHKSUM 4 #define JBD2_CRC32_CHKSUM_SIZE 4 #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) /* * Commit block header for storing transactional checksums: * * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum* * fields are used to store a checksum of the descriptor and data blocks. * * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum * field is used to store crc32c(uuid+commit_block). Each journal metadata * block gets its own checksum, and data block checksums are stored in * journal_block_tag (in the descriptor). The other h_chksum* fields are * not used. * * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses * journal_block_tag3_t to store a full 32-bit checksum. Everything else * is the same as v2. * * Checksum v1, v2, and v3 are mutually exclusive features. */ struct commit_header { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; unsigned char h_chksum_type; unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; __be64 h_commit_sec; __be32 h_commit_nsec; }; /* * The block tag: used to describe a single buffer in the journal. * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this * raw struct shouldn't be used for pointer math or sizeof() - use * journal_tag_bytes(journal) instead to compute this. */ typedef struct journal_block_tag3_s { __be32 t_blocknr; /* The on-disk block number */ __be32 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ __be32 t_checksum; /* crc32c(uuid+seq+block) */ } journal_block_tag3_t; typedef struct journal_block_tag_s { __be32 t_blocknr; /* The on-disk block number */ __be16 t_checksum; /* truncated crc32c(uuid+seq+block) */ __be16 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ } journal_block_tag_t; /* Tail of descriptor or revoke block, for checksumming */ struct jbd2_journal_block_tail { __be32 t_checksum; /* crc32c(uuid+descr_block) */ }; /* * The revoke descriptor: used on disk to describe a series of blocks to * be revoked from the log */ typedef struct jbd2_journal_revoke_header_s { journal_header_t r_header; __be32 r_count; /* Count of bytes used in the block */ } jbd2_journal_revoke_header_t; /* Definitions for the journal tag flags word: */ #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ #define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ #define JBD2_FLAG_DELETED 4 /* block deleted by this transaction */ #define JBD2_FLAG_LAST_TAG 8 /* last tag in this descriptor block */ /* * The journal superblock. All fields are in big-endian byte order. */ typedef struct journal_superblock_s { /* 0x0000 */ journal_header_t s_header; /* 0x000C */ /* Static information describing the journal */ __be32 s_blocksize; /* journal device blocksize */ __be32 s_maxlen; /* total blocks in journal file */ __be32 s_first; /* first block of log information */ /* 0x0018 */ /* Dynamic information describing the current state of the log */ __be32 s_sequence; /* first commit ID expected in log */ __be32 s_start; /* blocknr of start of log */ /* 0x0020 */ /* Error value, as set by jbd2_journal_abort(). */ __be32 s_errno; /* 0x0024 */ /* Remaining fields are only valid in a version-2 superblock */ __be32 s_feature_compat; /* compatible feature set */ __be32 s_feature_incompat; /* incompatible feature set */ __be32 s_feature_ro_compat; /* readonly-compatible feature set */ /* 0x0030 */ __u8 s_uuid[16]; /* 128-bit uuid for journal */ /* 0x0040 */ __be32 s_nr_users; /* Nr of filesystems sharing log */ __be32 s_dynsuper; /* Blocknr of dynamic superblock copy*/ /* 0x0048 */ __be32 s_max_transaction; /* Limit of journal blocks per trans.*/ __be32 s_max_trans_data; /* Limit of data blocks per trans. */ /* 0x0050 */ __u8 s_checksum_type; /* checksum type */ __u8 s_padding2[3]; /* 0x0054 */ __be32 s_num_fc_blks; /* Number of fast commit blocks */ __be32 s_head; /* blocknr of head of log, only uptodate * while the filesystem is clean */ /* 0x005C */ __u32 s_padding[40]; __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ /* 0x0400 */ } journal_superblock_t; #define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 #define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 #define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010 #define JBD2_FEATURE_INCOMPAT_FAST_COMMIT 0x00000020 /* See "journal feature predicate functions" below */ /* Features known to this kernel version: */ #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ JBD2_FEATURE_INCOMPAT_64BIT | \ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ JBD2_FEATURE_INCOMPAT_CSUM_V2 | \ JBD2_FEATURE_INCOMPAT_CSUM_V3 | \ JBD2_FEATURE_INCOMPAT_FAST_COMMIT) #ifdef __KERNEL__ #include <linux/fs.h> #include <linux/sched.h> enum jbd_state_bits { BH_JBD /* Has an attached ext3 journal_head */ = BH_PrivateStart, BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ BH_Freed, /* Has been freed (truncated) */ BH_Revoked, /* Has been revoked from the log */ BH_RevokeValid, /* Revoked flag is valid */ BH_JBDDirty, /* Is dirty but journaled */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Shadow, /* IO on shadow buffer is running */ BH_Verified, /* Metadata block has been verified ok */ BH_JBDPrivateStart, /* First bit available for private use by FS */ }; BUFFER_FNS(JBD, jbd) BUFFER_FNS(JWrite, jwrite) BUFFER_FNS(JBDDirty, jbddirty) TAS_BUFFER_FNS(JBDDirty, jbddirty) BUFFER_FNS(Revoked, revoked) TAS_BUFFER_FNS(Revoked, revoked) BUFFER_FNS(RevokeValid, revokevalid) TAS_BUFFER_FNS(RevokeValid, revokevalid) BUFFER_FNS(Freed, freed) BUFFER_FNS(Shadow, shadow) BUFFER_FNS(Verified, verified) static inline struct buffer_head *jh2bh(struct journal_head *jh) { return jh->b_bh; } static inline struct journal_head *bh2jh(struct buffer_head *bh) { return bh->b_private; } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { bit_spin_lock(BH_JournalHead, &bh->b_state); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { bit_spin_unlock(BH_JournalHead, &bh->b_state); } #define J_ASSERT(assert) BUG_ON(!(assert)) #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) #if defined(JBD2_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) #define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr) #else #define __journal_expect(expr, why...) \ ({ \ int val = (expr); \ if (!val) { \ printk(KERN_ERR \ "JBD2 unexpected failure: %s: %s;\n", \ __func__, #expr); \ printk(KERN_ERR why "\n"); \ } \ val; \ }) #define J_EXPECT(expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) #endif /* Flags in jbd_inode->i_flags */ #define __JI_COMMIT_RUNNING 0 #define __JI_WRITE_DATA 1 #define __JI_WAIT_DATA 2 /* * Commit of the inode data in progress. We use this flag to protect us from * concurrent deletion of inode. We cannot use reference to inode for this * since we cannot afford doing last iput() on behalf of kjournald */ #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) /* Write allocated dirty buffers in this inode before commit */ #define JI_WRITE_DATA (1 << __JI_WRITE_DATA) /* Wait for outstanding data writes for this inode before commit */ #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. */ struct jbd2_inode { /** * @i_transaction: * * Which transaction does this inode belong to? Either the running * transaction or the committing one. [j_list_lock] */ transaction_t *i_transaction; /** * @i_next_transaction: * * Pointer to the running transaction modifying inode's data in case * there is already a committing transaction touching it. [j_list_lock] */ transaction_t *i_next_transaction; /** * @i_list: List of inodes in the i_transaction [j_list_lock] */ struct list_head i_list; /** * @i_vfs_inode: * * VFS inode this inode belongs to [constant for lifetime of structure] */ struct inode *i_vfs_inode; /** * @i_flags: Flags of inode [j_list_lock] */ unsigned long i_flags; /** * @i_dirty_start: * * Offset in bytes where the dirty range for this inode starts. * [j_list_lock] */ loff_t i_dirty_start; /** * @i_dirty_end: * * Inclusive offset in bytes where the dirty range for this inode * ends. [j_list_lock] */ loff_t i_dirty_end; }; struct jbd2_revoke_table_s; /** * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete * type associated with handle_t. * @h_transaction: Which compound transaction is this update a part of? * @h_journal: Which journal handle belongs to - used iff h_reserved set. * @h_rsv_handle: Handle reserved for finishing the logical operation. * @h_total_credits: Number of remaining buffers we are allowed to add to * journal. These are dirty buffers and revoke descriptor blocks. * @h_revoke_credits: Number of remaining revoke records available for handle * @h_ref: Reference count on this handle. * @h_err: Field for caller's use to track errors through large fs operations. * @h_sync: Flag for sync-on-close. * @h_jdata: Flag to force data journaling. * @h_reserved: Flag for handle for reserved credits. * @h_aborted: Flag indicating fatal error on handle. * @h_type: For handle statistics. * @h_line_no: For handle statistics. * @h_start_jiffies: Handle Start time. * @h_requested_credits: Holds @h_total_credits after handle is started. * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started. * @saved_alloc_context: Saved context while transaction is open. **/ /* Docbook can't yet cope with the bit fields, but will leave the documentation * in so it can be fixed later. */ struct jbd2_journal_handle { union { transaction_t *h_transaction; /* Which journal handle belongs to - used iff h_reserved set */ journal_t *h_journal; }; handle_t *h_rsv_handle; int h_total_credits; int h_revoke_credits; int h_revoke_credits_requested; int h_ref; int h_err; /* Flags [no locking] */ unsigned int h_sync: 1; unsigned int h_jdata: 1; unsigned int h_reserved: 1; unsigned int h_aborted: 1; unsigned int h_type: 8; unsigned int h_line_no: 16; unsigned long h_start_jiffies; unsigned int h_requested_credits; unsigned int saved_alloc_context; }; /* * Some stats for checkpoint phase */ struct transaction_chp_stats_s { unsigned long cs_chp_time; __u32 cs_forced_to_close; __u32 cs_written; __u32 cs_dropped; }; /* The transaction_t type is the guts of the journaling mechanism. It * tracks a compound transaction through its various states: * * RUNNING: accepting new updates * LOCKED: Updates still running but we don't accept new ones * RUNDOWN: Updates are tidying up but have finished requesting * new buffers to modify (state not used for now) * FLUSH: All updates complete, but we are still writing to disk * COMMIT: All data on disk, writing commit record * FINISHED: We still have to keep the transaction for checkpointing. * * The transaction keeps track of all of the buffers modified by a * running transaction, and all of the buffers committed but not yet * flushed to home for finished transactions. * (Locking Documentation improved by LockDoc) */ /* * Lock ranking: * * j_list_lock * ->jbd_lock_bh_journal_head() (This is "innermost") * * j_state_lock * ->b_state_lock * * b_state_lock * ->j_list_lock * * j_state_lock * ->j_list_lock (journal_unmap_buffer) * */ struct transaction_s { /* Pointer to the journal for this transaction. [no locking] */ journal_t *t_journal; /* Sequence number for this transaction [no locking] */ tid_t t_tid; /* * Transaction's current state * [no locking - only kjournald2 alters this] * [j_list_lock] guards transition of a transaction into T_FINISHED * state and subsequent call of __jbd2_journal_drop_transaction() * FIXME: needs barriers * KLUDGE: [use j_state_lock] */ enum { T_RUNNING, T_LOCKED, T_SWITCH, T_FLUSH, T_COMMIT, T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, T_COMMIT_CALLBACK, T_FINISHED } t_state; /* * Where in the log does this transaction's commit start? [no locking] */ unsigned long t_log_start; /* * Number of buffers on the t_buffers list [j_list_lock, no locks * needed for jbd2 thread] */ int t_nr_buffers; /* * Doubly-linked circular list of all buffers reserved but not yet * modified by this transaction [j_list_lock, no locks needed fo * jbd2 thread] */ struct journal_head *t_reserved_list; /* * Doubly-linked circular list of all metadata buffers owned by this * transaction [j_list_lock, no locks needed for jbd2 thread] */ struct journal_head *t_buffers; /* * Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) * [j_list_lock] */ struct journal_head *t_forget; /* * Doubly-linked circular list of all buffers still to be flushed before * this transaction can be checkpointed. [j_list_lock] */ struct journal_head *t_checkpoint_list; /* * Doubly-linked circular list of metadata buffers being * shadowed by log IO. The IO buffers on the iobuf list and * the shadow buffers on this list match each other one for * one at all times. [j_list_lock, no locks needed for jbd2 * thread] */ struct journal_head *t_shadow_list; /* * List of inodes associated with the transaction; e.g., ext4 uses * this to track inodes in data=ordered and data=journal mode that * need special handling on transaction commit; also used by ocfs2. * [j_list_lock] */ struct list_head t_inode_list; /* * Longest time some handle had to wait for running transaction */ unsigned long t_max_wait; /* * When transaction started */ unsigned long t_start; /* * When commit was requested [j_state_lock] */ unsigned long t_requested; /* * Checkpointing stats [j_list_lock] */ struct transaction_chp_stats_s t_chp_stats; /* * Number of outstanding updates running on this transaction * [none] */ atomic_t t_updates; /* * Number of blocks reserved for this transaction in the journal. * This is including all credits reserved when starting transaction * handles as well as all journal descriptor blocks needed for this * transaction. [none] */ atomic_t t_outstanding_credits; /* * Number of revoke records for this transaction added by already * stopped handles. [none] */ atomic_t t_outstanding_revokes; /* * How many handles used this transaction? [none] */ atomic_t t_handle_count; /* * Forward and backward links for the circular list of all transactions * awaiting checkpoint. [j_list_lock] */ transaction_t *t_cpnext, *t_cpprev; /* * When will the transaction expire (become due for commit), in jiffies? * [no locking] */ unsigned long t_expires; /* * When this transaction started, in nanoseconds [no locking] */ ktime_t t_start_time; /* * This transaction is being forced and some process is * waiting for it to finish. */ unsigned int t_synchronous_commit:1; /* Disk flush needs to be sent to fs partition [no locking] */ int t_need_data_flush; /* * For use by the filesystem to store fs-specific data * structures associated with the transaction */ struct list_head t_private_list; }; struct transaction_run_stats_s { unsigned long rs_wait; unsigned long rs_request_delay; unsigned long rs_running; unsigned long rs_locked; unsigned long rs_flushing; unsigned long rs_logging; __u32 rs_handle_count; __u32 rs_blocks; __u32 rs_blocks_logged; }; struct transaction_stats_s { unsigned long ts_tid; unsigned long ts_requested; struct transaction_run_stats_s run; }; static inline unsigned long jbd2_time_diff(unsigned long start, unsigned long end) { if (end >= start) return end - start; return end + (MAX_JIFFY_OFFSET - start); } #define JBD2_NR_BATCH 64 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; #define JBD2_FC_REPLAY_STOP 0 #define JBD2_FC_REPLAY_CONTINUE 1 /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. */ struct journal_s { /** * @j_flags: General journaling state flags [j_state_lock, * no lock for quick racy checks] */ unsigned long j_flags; /** * @j_atomic_flags: Atomic journaling state flags. */ unsigned long j_atomic_flags; /** * @j_errno: * * Is there an outstanding uncleared error on the journal (from a prior * abort)? [j_state_lock] */ int j_errno; /** * @j_abort_mutex: Lock the whole aborting procedure. */ struct mutex j_abort_mutex; /** * @j_sb_buffer: The first part of the superblock buffer. */ struct buffer_head *j_sb_buffer; /** * @j_superblock: The second part of the superblock buffer. */ journal_superblock_t *j_superblock; /** * @j_state_lock: Protect the various scalars in the journal. */ rwlock_t j_state_lock; /** * @j_barrier_count: * * Number of processes waiting to create a barrier lock [j_state_lock, * no lock for quick racy checks] */ int j_barrier_count; /** * @j_barrier: The barrier lock itself. */ struct mutex j_barrier; /** * @j_running_transaction: * * Transactions: The current running transaction... * [j_state_lock, no lock for quick racy checks] [caller holding * open handle] */ transaction_t *j_running_transaction; /** * @j_committing_transaction: * * the transaction we are pushing to disk * [j_state_lock] [caller holding open handle] */ transaction_t *j_committing_transaction; /** * @j_checkpoint_transactions: * * ... and a linked circular list of all transactions waiting for * checkpointing. [j_list_lock] */ transaction_t *j_checkpoint_transactions; /** * @j_wait_transaction_locked: * * Wait queue for waiting for a locked transaction to start committing, * or for a barrier lock to be released. */ wait_queue_head_t j_wait_transaction_locked; /** * @j_wait_done_commit: Wait queue for waiting for commit to complete. */ wait_queue_head_t j_wait_done_commit; /** * @j_wait_commit: Wait queue to trigger commit. */ wait_queue_head_t j_wait_commit; /** * @j_wait_updates: Wait queue to wait for updates to complete. */ wait_queue_head_t j_wait_updates; /** * @j_wait_reserved: * * Wait queue to wait for reserved buffer credits to drop. */ wait_queue_head_t j_wait_reserved; /** * @j_fc_wait: * * Wait queue to wait for completion of async fast commits. */ wait_queue_head_t j_fc_wait; /** * @j_checkpoint_mutex: * * Semaphore for locking against concurrent checkpoints. */ struct mutex j_checkpoint_mutex; /** * @j_chkpt_bhs: * * List of buffer heads used by the checkpoint routine. This * was moved from jbd2_log_do_checkpoint() to reduce stack * usage. Access to this array is controlled by the * @j_checkpoint_mutex. [j_checkpoint_mutex] */ struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; /** * @j_shrinker: * * Journal head shrinker, reclaim buffer's journal head which * has been written back. */ struct shrinker *j_shrinker; /** * @j_checkpoint_jh_count: * * Number of journal buffers on the checkpoint list. [j_list_lock] */ struct percpu_counter j_checkpoint_jh_count; /** * @j_shrink_transaction: * * Record next transaction will shrink on the checkpoint list. * [j_list_lock] */ transaction_t *j_shrink_transaction; /** * @j_head: * * Journal head: identifies the first unused block in the journal. * [j_state_lock] */ unsigned long j_head; /** * @j_tail: * * Journal tail: identifies the oldest still-used block in the journal. * [j_state_lock] */ unsigned long j_tail; /** * @j_free: * * Journal free: how many free blocks are there in the journal? * [j_state_lock] */ unsigned long j_free; /** * @j_first: * * The block number of the first usable block in the journal * [j_state_lock]. */ unsigned long j_first; /** * @j_last: * * The block number one beyond the last usable block in the journal * [j_state_lock]. */ unsigned long j_last; /** * @j_fc_first: * * The block number of the first fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_first; /** * @j_fc_off: * * Number of fast commit blocks currently allocated. Accessed only * during fast commit. Currently only process can do fast commit, so * this field is not protected by any lock. */ unsigned long j_fc_off; /** * @j_fc_last: * * The block number one beyond the last fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_last; /** * @j_dev: Device where we store the journal. */ struct block_device *j_dev; /** * @j_blocksize: Block size for the location where we store the journal. */ int j_blocksize; /** * @j_blk_offset: * * Starting block offset into the device where we store the journal. */ unsigned long long j_blk_offset; /** * @j_devname: Journal device name. */ char j_devname[BDEVNAME_SIZE+24]; /** * @j_fs_dev: * * Device which holds the client fs. For internal journal this will be * equal to j_dev. */ struct block_device *j_fs_dev; /** * @j_total_len: Total maximum capacity of the journal region on disk. */ unsigned int j_total_len; /** * @j_reserved_credits: * * Number of buffers reserved from the running transaction. */ atomic_t j_reserved_credits; /** * @j_list_lock: Protects the buffer lists and internal buffer state. */ spinlock_t j_list_lock; /** * @j_inode: * * Optional inode where we store the journal. If present, all * journal block numbers are mapped into this inode via bmap(). */ struct inode *j_inode; /** * @j_tail_sequence: * * Sequence number of the oldest transaction in the log [j_state_lock] */ tid_t j_tail_sequence; /** * @j_transaction_sequence: * * Sequence number of the next transaction to grant [j_state_lock] */ tid_t j_transaction_sequence; /** * @j_commit_sequence: * * Sequence number of the most recently committed transaction * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_sequence; /** * @j_commit_request: * * Sequence number of the most recent transaction wanting commit * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_request; /** * @j_uuid: * * Journal uuid: identifies the object (filesystem, LVM volume etc) * backed by this journal. This will eventually be replaced by an array * of uuids, allowing us to index multiple devices within a single * journal and to perform atomic updates across them. */ __u8 j_uuid[16]; /** * @j_task: Pointer to the current commit thread for this journal. */ struct task_struct *j_task; /** * @j_max_transaction_buffers: * * Maximum number of metadata buffers to allow in a single compound * commit transaction. */ int j_max_transaction_buffers; /** * @j_revoke_records_per_block: * * Number of revoke records that fit in one descriptor block. */ int j_revoke_records_per_block; /** * @j_commit_interval: * * What is the maximum transaction lifetime before we begin a commit? */ unsigned long j_commit_interval; /** * @j_commit_timer: The timer used to wakeup the commit thread. */ struct timer_list j_commit_timer; /** * @j_revoke_lock: Protect the revoke table. */ spinlock_t j_revoke_lock; /** * @j_revoke: * * The revoke table - maintains the list of revoked blocks in the * current transaction. */ struct jbd2_revoke_table_s *j_revoke; /** * @j_revoke_table: Alternate revoke tables for j_revoke. */ struct jbd2_revoke_table_s *j_revoke_table[2]; /** * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction. */ struct buffer_head **j_wbuf; /** * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only * during a fast commit. Currently only process can do fast commit, so * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; /** * @j_wbufsize: * * Size of @j_wbuf array. */ int j_wbufsize; /** * @j_fc_wbufsize: * * Size of @j_fc_wbuf array. */ int j_fc_wbufsize; /** * @j_last_sync_writer: * * The pid of the last person to run a synchronous operation * through the journal. */ pid_t j_last_sync_writer; /** * @j_average_commit_time: * * The average amount of time in nanoseconds it takes to commit a * transaction to disk. [j_state_lock] */ u64 j_average_commit_time; /** * @j_min_batch_time: * * Minimum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_min_batch_time; /** * @j_max_batch_time: * * Maximum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_max_batch_time; /** * @j_commit_callback: * * This function is called when a transaction is closed. */ void (*j_commit_callback)(journal_t *, transaction_t *); /** * @j_submit_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WRITE_DATA flag * before we start to write out the transaction to the journal. */ int (*j_submit_inode_data_buffers) (struct jbd2_inode *); /** * @j_finish_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WAIT_DATA flag * after we have written the transaction to the journal * but before we write out the commit block. */ int (*j_finish_inode_data_buffers) (struct jbd2_inode *); /* * Journal statistics */ /** * @j_history_lock: Protect the transactions statistics history. */ spinlock_t j_history_lock; /** * @j_proc_entry: procfs entry for the jbd statistics directory. */ struct proc_dir_entry *j_proc_entry; /** * @j_stats: Overall statistics. */ struct transaction_stats_s j_stats; /** * @j_failed_commit: Failed journal commit ID. */ unsigned int j_failed_commit; /** * @j_private: * * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here. */ void *j_private; /** * @j_chksum_driver: * * Reference to checksum algorithm driver via cryptoapi. */ struct crypto_shash *j_chksum_driver; /** * @j_csum_seed: * * Precomputed journal UUID checksum for seeding other checksums. */ __u32 j_csum_seed; #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * @j_trans_commit_map: * * Lockdep entity to track transaction commit dependencies. Handles * hold this "lock" for read, when we wait for commit, we acquire the * "lock" for writing. This matches the properties of jbd2 journalling * where the running transaction has to wait for all handles to be * dropped to commit that transaction and also acquiring a handle may * require transaction commit to finish. */ struct lockdep_map j_trans_commit_map; #endif /** * @j_fc_cleanup_callback: * * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: * * File-system specific function that performs replay of a fast * commit. JBD2 calls this function for each fast commit block found in * the journal. This function should return JBD2_FC_REPLAY_CONTINUE * to indicate that the block was processed correctly and more fast * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP * indicates the end of replay (no more blocks remaining). A negative * return value indicates error. */ int (*j_fc_replay_callback)(struct journal_s *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_commit_id); /** * @j_bmap: * * Bmap function that should be used instead of the generic * VFS bmap function. */ int (*j_bmap)(struct journal_s *journal, sector_t *block); }; #define jbd2_might_wait_for_commit(j) \ do { \ rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \ rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \ } while (0) /* * We can support any known requested features iff the * superblock is not in version 1. Otherwise we fail to support any * extended sb features. */ static inline bool jbd2_format_support_feature(journal_t *j) { return j->j_superblock->s_header.h_blocktype != cpu_to_be32(JBD2_SUPERBLOCK_V1); } /* journal feature predicate functions */ #define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_compat & \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat |= \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat &= \ ~cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } #define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_ro_compat & \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat |= \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat &= \ ~cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } #define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_incompat & \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat |= \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat &= \ ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } JBD2_FEATURE_COMPAT_FUNCS(checksum, CHECKSUM) JBD2_FEATURE_INCOMPAT_FUNCS(revoke, REVOKE) JBD2_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) JBD2_FEATURE_INCOMPAT_FUNCS(async_commit, ASYNC_COMMIT) JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) /* * Journal flag definitions */ #define JBD2_UNMOUNT 0x001 /* Journal thread is being destroyed */ #define JBD2_ABORT 0x002 /* Journaling has been aborted for errors. */ #define JBD2_ACK_ERR 0x004 /* The errno in the sb has been acked */ #define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ #define JBD2_BARRIER 0x020 /* Use IDE barriers */ #define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file * data write error in ordered * mode */ #define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on * clean and empty filesystem * logging area */ #define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ #define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 #define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002 #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ JBD2_JOURNAL_FLUSH_ZEROOUT) /* * Journal atomic flag definitions */ #define JBD2_CHECKPOINT_IO_ERROR 0x001 /* Detect io error while writing * buffer back to disk */ /* * Function declarations for the journaling transaction and buffer * management */ /* Filing buffers */ extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) { list_add_tail(&bh->b_assoc_buffers, head); } static inline void jbd2_unfile_log_bh(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); } /* Log buffer allocation */ struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); int jbd2_journal_try_remove_checkpoint(struct journal_head *jh); void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); /* * Triggers */ struct jbd2_buffer_trigger_type { /* * Fired a the moment data to write to the journal are known to be * stable - so either at the moment b_frozen_data is created or just * before a buffer is written to the journal. mapped_data is a mapped * buffer that is the frozen data for commit. */ void (*t_frozen)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size); /* * Fired during journal abort for dirty buffers that will not be * committed. */ void (*t_abort)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh); }; extern void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers); extern void jbd2_buffer_abort_trigger(struct journal_head *jh, struct jbd2_buffer_trigger_type *triggers); /* Buffer IO */ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct buffer_head **bh_out, sector_t blocknr); /* Transaction cache support */ extern void jbd2_journal_destroy_transaction_cache(void); extern int __init jbd2_journal_init_transaction_cache(void); extern void jbd2_journal_free_transaction(transaction_t *); /* * Journal locking. * * We need to lock the journal during transaction state changes so that nobody * ever tries to take a handle on the running transaction while we are in the * middle of moving it to the commit phase. j_state_lock does this. * * Note that the locking is completely interrupt unsafe. We never touch * journal structures from interrupts. */ static inline handle_t *journal_current_handle(void) { return current->journal_info; } /* The journaling code user interface: * * Create and destroy handles * Register buffer modifications against the current transaction. */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks, int revoke_records, gfp_t gfp_mask, unsigned int type, unsigned int line_no); extern int jbd2_journal_restart(handle_t *, int nblocks); extern int jbd2__journal_restart(handle_t *, int nblocks, int revoke_records, gfp_t gfp_mask); extern int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, unsigned int line_no); extern void jbd2_journal_free_reserved(handle_t *handle); extern int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); int jbd2_journal_invalidate_folio(journal_t *, struct folio *, size_t offset, size_t length); bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio); extern int jbd2_journal_stop(handle_t *); extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); void jbd2_journal_wait_updates(journal_t *); extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int bsize); extern journal_t * jbd2_journal_init_inode (struct inode *); extern int jbd2_journal_update_format (journal_t *); extern int jbd2_journal_check_used_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_check_available_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, blk_opf_t); extern void jbd2_journal_abort (journal_t *, int); extern int jbd2_journal_errno (journal_t *); extern void jbd2_journal_ack_err (journal_t *); extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); extern int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_finish_inode_data_buffers( struct jbd2_inode *jinode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management */ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh); struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh); void jbd2_journal_put_journal_head(struct journal_head *jh); /* * handle management */ extern struct kmem_cache *jbd2_handle_cache; static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) { return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags); } static inline void jbd2_free_handle(handle_t *handle) { kmem_cache_free(jbd2_handle_cache, handle); } /* * jbd2_inode management (optional, for those file systems that want to use * dynamically allocated jbd2_inode structures) */ extern struct kmem_cache *jbd2_inode_cache; static inline struct jbd2_inode *jbd2_alloc_inode(gfp_t gfp_flags) { return kmem_cache_alloc(jbd2_inode_cache, gfp_flags); } static inline void jbd2_free_inode(struct jbd2_inode *jinode) { kmem_cache_free(jbd2_inode_cache, jinode); } /* Primary revoke support */ #define JOURNAL_REVOKE_DEFAULT_HASH 256 extern int jbd2_journal_init_revoke(journal_t *, int); extern void jbd2_journal_destroy_revoke_record_cache(void); extern void jbd2_journal_destroy_revoke_table_cache(void); extern int __init jbd2_journal_init_revoke_record_cache(void); extern int __init jbd2_journal_init_revoke_table_cache(void); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); extern void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs); /* Recovery revoke support */ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); extern void jbd2_journal_clear_revoke(journal_t *); extern void jbd2_journal_switch_revoke_table(journal_t *journal); extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); /* * The log thread user interface: * * Request space in the current transaction, and force transaction commit * transitions on demand. */ int jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_transaction_committed(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); int jbd2_fc_release_bufs(journal_t *journal); static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal) { return (journal->j_total_len - journal->j_fc_wbufsize) / 4; } /* * is_journal_abort * * Simple test wrapper function to test the JBD2_ABORT state flag. This * bit, when set, indicates that we have had a fatal error somewhere, * either inside the journaling layer or indicated to us by the client * (eg. ext3), and that we and should not commit any further * transactions. */ static inline int is_journal_aborted(journal_t *journal) { return journal->j_flags & JBD2_ABORT; } static inline int is_handle_aborted(handle_t *handle) { if (handle->h_aborted || !handle->h_transaction) return 1; return is_journal_aborted(handle->h_transaction->t_journal); } static inline void jbd2_journal_abort_handle(handle_t *handle) { handle->h_aborted = 1; } #endif /* __KERNEL__ */ /* Comparison functions for transaction IDs: perform comparisons using * modulo arithmetic so that they work over sequence number wraps. */ static inline int tid_gt(tid_t x, tid_t y) { int difference = (x - y); return (difference > 0); } static inline int tid_geq(tid_t x, tid_t y) { int difference = (x - y); return (difference >= 0); } extern int jbd2_journal_blocks_per_page(struct inode *inode); extern size_t journal_tag_bytes(journal_t *journal); static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j) { return jbd2_has_feature_csum2(j) || jbd2_has_feature_csum3(j); } static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) { WARN_ON_ONCE(jbd2_journal_has_csum_v2or3_feature(journal) && journal->j_chksum_driver == NULL); return journal->j_chksum_driver != NULL; } static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) { int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; } /* * Return number of free blocks in the log. Must be called under j_state_lock. */ static inline unsigned long jbd2_log_space_left(journal_t *journal) { /* Allow for rounding errors */ long free = journal->j_free - 32; if (journal->j_committing_transaction) { free -= atomic_read(&journal-> j_committing_transaction->t_outstanding_credits); } return max_t(long, free, 0); } /* * Definitions which augment the buffer_head layer */ /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ #define BJ_Metadata 1 /* Normal journaled metadata */ #define BJ_Forget 2 /* Buffer superseded by this transaction */ #define BJ_Shadow 3 /* Buffer contents being shadowed to the log */ #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 /* JBD uses a CRC32 checksum */ #define JBD_MAX_CHECKSUM_SIZE 4 static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { struct { struct shash_desc shash; char ctx[JBD_MAX_CHECKSUM_SIZE]; } desc; int err; BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > JBD_MAX_CHECKSUM_SIZE); desc.shash.tfm = journal->j_chksum_driver; *(u32 *)desc.ctx = crc; err = crypto_shash_update(&desc.shash, address, length); BUG_ON(err); return *(u32 *)desc.ctx; } /* Return most recent uncommitted transaction */ static inline tid_t jbd2_get_latest_transaction(journal_t *journal) { tid_t tid; read_lock(&journal->j_state_lock); tid = journal->j_commit_request; if (journal->j_running_transaction) tid = journal->j_running_transaction->t_tid; read_unlock(&journal->j_state_lock); return tid; } static inline int jbd2_handle_buffer_credits(handle_t *handle) { journal_t *journal; if (!handle->h_reserved) journal = handle->h_transaction->t_journal; else journal = handle->h_journal; return handle->h_total_credits - DIV_ROUND_UP(handle->h_revoke_credits_requested, journal->j_revoke_records_per_block); } #ifdef __KERNEL__ #define buffer_trace_init(bh) do {} while (0) #define print_buffer_fields(bh) do {} while (0) #define print_buffer_trace(bh) do {} while (0) #define BUFFER_TRACE(bh, info) do {} while (0) #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) #define JBUFFER_TRACE(jh, info) do {} while (0) #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* _LINUX_JBD2_H */
4874 4883 640 640 639 639 640 640 640 640 640 640 640 640 640 639 640 640 640 639 640 873 872 872 873 873 873 872 872 873 873 872 872 873 873 873 873 873 873 873 872 874 743 743 743 743 743 743 743 873 873 873 873 640 639 640 639 638 639 872 874 874 873 874 874 874 874 639 639 618 852 874 713 874 874 640 20 872 872 414 638 606 111 640 640 537 872 873 873 873 873 873 873 873 873 873 873 873 873 873 872 873 20 873 873 872 873 872 873 873 803 803 872 873 873 872 871 873 873 872 873 870 873 640 640 640 640 639 640 640 872 872 872 871 873 872 873 640 640 640 640 640 639 639 640 640 640 640 640 524 524 872 873 872 872 873 872 873 873 873 873 873 873 872 873 871 872 872 873 871 871 873 872 872 873 871 873 872 873 873 873 872 872 873 872 872 872 872 132 750 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1993 Linus Torvalds * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 * Numa awareness, Christoph Lameter, SGI, June 2005 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019 */ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/set_memory.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/rbtree.h> #include <linux/xarray.h> #include <linux/io.h> #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/memcontrol.h> #include <linux/llist.h> #include <linux/uio.h> #include <linux/bitops.h> #include <linux/rbtree_augmented.h> #include <linux/overflow.h> #include <linux/pgtable.h> #include <linux/hugetlb.h> #include <linux/sched/mm.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> #define CREATE_TRACE_POINTS #include <trace/events/vmalloc.h> #include "internal.h" #include "pgalloc-track.h" #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1; static int __init set_nohugeiomap(char *str) { ioremap_max_page_shift = PAGE_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static const unsigned int ioremap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC static bool __ro_after_init vmap_allow_huge = true; static int __init set_nohugevmalloc(char *str) { vmap_allow_huge = false; return 0; } early_param("nohugevmalloc", set_nohugevmalloc); #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ static const bool vmap_allow_huge = false; #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } EXPORT_SYMBOL(is_vmalloc_addr); struct vfree_deferred { struct llist_head list; struct work_struct wq; }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pte_t *pte; u64 pfn; unsigned long size = PAGE_SIZE; pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { BUG_ON(!pte_none(ptep_get(pte))); #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); if (size != PAGE_SIZE) { pte_t entry = pfn_pte(pfn, prot); entry = arch_make_huge_pte(entry, ilog2(size), 0); set_huge_pte_at(&init_mm, addr, pte, entry, size); pfn += PFN_DOWN(size); continue; } #endif set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < PMD_SHIFT) return 0; if (!arch_vmap_pmd_supported(prot)) return 0; if ((end - addr) != PMD_SIZE) return 0; if (!IS_ALIGNED(addr, PMD_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, PMD_SIZE)) return 0; if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) return 0; return pmd_set_huge(pmd, phys_addr, prot); } static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_PMD_MODIFIED; continue; } if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < PUD_SHIFT) return 0; if (!arch_vmap_pud_supported(prot)) return 0; if ((end - addr) != PUD_SIZE) return 0; if (!IS_ALIGNED(addr, PUD_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, PUD_SIZE)) return 0; if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) return 0; return pud_set_huge(pud, phys_addr, prot); } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_PUD_MODIFIED; continue; } if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < P4D_SHIFT) return 0; if (!arch_vmap_p4d_supported(prot)) return 0; if ((end - addr) != P4D_SIZE) return 0; if (!IS_ALIGNED(addr, P4D_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, P4D_SIZE)) return 0; if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) return 0; return p4d_set_huge(p4d, phys_addr, prot); } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_P4D_MODIFIED; continue; } if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_range_noflush(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { pgd_t *pgd; unsigned long start; unsigned long next; int err; pgtbl_mod_mask mask = 0; might_sleep(); BUG_ON(addr >= end); start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return err; } int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { int err; err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), ioremap_max_page_shift); flush_cache_vmap(addr, end); if (!err) err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, ioremap_max_page_shift); return err; } static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pte_t *pte; pte = pte_offset_kernel(pmd, addr); do { pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); *mask |= PGTBL_PTE_MODIFIED; } static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); cleared = pmd_clear_huge(pmd); if (cleared || pmd_bad(*pmd)) *mask |= PGTBL_PMD_MODIFIED; if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next, mask); cond_resched(); } while (pmd++, addr = next, addr != end); } static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); cleared = pud_clear_huge(pud); if (cleared || pud_bad(*pud)) *mask |= PGTBL_PUD_MODIFIED; if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); p4d_clear_huge(p4d); if (p4d_bad(*p4d)) *mask |= PGTBL_P4D_MODIFIED; if (p4d_none_or_clear_bad(p4d)) continue; vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } /* * vunmap_range_noflush is similar to vunmap_range, but does not * flush caches or TLBs. * * The caller is responsible for calling flush_cache_vmap() before calling * this function, and flush_tlb_kernel_range after it has returned * successfully (and before the addresses are expected to cause a page fault * or be re-mapped for something else, if TLB flushes are being delayed or * coalesced). * * This is an internal function only. Do not use outside mm/. */ void __vunmap_range_noflush(unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; unsigned long addr = start; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); } void vunmap_range_noflush(unsigned long start, unsigned long end) { kmsan_vunmap_range_noflush(start, end); __vunmap_range_noflush(start, end); } /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap * @end: end of the VM area to unmap (non-inclusive) * * Clears any present PTEs in the virtual address range, flushes TLBs and * caches. Any subsequent access to the address before it has been re-mapped * is a kernel bug. */ void vunmap_range(unsigned long addr, unsigned long end) { flush_cache_vunmap(addr, end); vunmap_range_noflush(addr, end); flush_tlb_kernel_range(addr, end); } static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pte_t *pte; /* * nr is a running index into the array which helps higher level * callers keep track of where we're up to. */ pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { struct page *page = pages[*nr]; if (WARN_ON(!pte_none(ptep_get(pte)))) return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; if (WARN_ON(!pfn_valid(page_to_pfn(page)))) return -EINVAL; set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages) { unsigned long start = addr; pgd_t *pgd; unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return 0; } /* * vmap_pages_range_noflush is similar to vmap_pages_range, but does not * flush caches. * * The caller is responsible for calling flush_cache_vmap() after this * function returns successfully and before the addresses are accessed. * * This is an internal function only. Do not use outside mm/. */ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; WARN_ON(page_shift < PAGE_SHIFT); if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || page_shift == PAGE_SHIFT) return vmap_small_pages_range_noflush(addr, end, prot, pages); for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { int err; err = vmap_range_noflush(addr, addr + (1UL << page_shift), page_to_phys(pages[i]), prot, page_shift); if (err) return err; addr += 1UL << page_shift; } return 0; } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); if (ret) return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map * @end: end of the VM area to map (non-inclusive) * @prot: page protection flags to use * @pages: pages to map (always PAGE_SIZE pages) * @page_shift: maximum shift that the pages may be mapped with, @pages must * be aligned and contiguous up to at least this shift. * * RETURNS: * 0 on success, -errno on failure. */ static int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int err; err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); flush_cache_vmap(addr, end); return err; } int is_vmalloc_or_module_addr(const void *x) { /* * ARM, x86-64 and sparc64 put modules in a special place, * and fall back on vmalloc() if that fails. Others * just put it in the vmalloc space. */ #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif return is_vmalloc_addr(x); } EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr); /* * Walk a vmap address to the struct page it maps. Huge vmap mappings will * return the tail page that corresponds to the base page address, which * matches small vmap mappings. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for * architectures that do not vmalloc module space */ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); if (pgd_none(*pgd)) return NULL; if (WARN_ON_ONCE(pgd_leaf(*pgd))) return NULL; /* XXX: no allowance for huge pgd */ if (WARN_ON_ONCE(pgd_bad(*pgd))) return NULL; p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; if (p4d_leaf(*p4d)) return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(p4d_bad(*p4d))) return NULL; pud = pud_offset(p4d, addr); if (pud_none(*pud)) return NULL; if (pud_leaf(*pud)) return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pud_bad(*pud))) return NULL; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; if (pmd_leaf(*pmd)) return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; ptep = pte_offset_kernel(pmd, addr); pte = ptep_get(ptep); if (pte_present(pte)) page = pte_page(pte); return page; } EXPORT_SYMBOL(vmalloc_to_page); /* * Map a vmalloc()-space virtual address to the physical page frame number. */ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) { return page_to_pfn(vmalloc_to_page(vmalloc_addr)); } EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 static DEFINE_SPINLOCK(vmap_area_lock); static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); static struct rb_root vmap_area_root = RB_ROOT; static bool vmap_initialized __read_mostly; static struct rb_root purge_vmap_area_root = RB_ROOT; static LIST_HEAD(purge_vmap_area_list); static DEFINE_SPINLOCK(purge_vmap_area_lock); /* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to * make things faster. Especially in "no edge" splitting of * free block. */ static struct kmem_cache *vmap_area_cachep; /* * This linked list is used in pair with free_vmap_area_root. * It gives O(1) access to prev/next to perform fast coalescing. */ static LIST_HEAD(free_vmap_area_list); /* * This augment red-black tree represents the free vmap space. * All vmap_area objects in this tree are sorted by va->va_start * address. It is used for allocation and merging when a vmap * object is released. * * Each vmap_area node contains a maximum available free block * of its sub-tree, right or left. Therefore it is possible to * find a lowest match of free area. */ static struct rb_root free_vmap_area_root = RB_ROOT; /* * Preload a CPU with one object for "no edge" split case. The * aim is to get rid of allocations from the atomic context, thus * to use more permissive allocation masks. */ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); static __always_inline unsigned long va_size(struct vmap_area *va) { return (va->va_end - va->va_start); } static __always_inline unsigned long get_subtree_max_size(struct rb_node *node) { struct vmap_area *va; va = rb_entry_safe(node, struct vmap_area, rb_node); return va ? va->subtree_max_size : 0; } RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void reclaim_and_purge_vmap_areas(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); static atomic_long_t nr_vmalloc_pages; unsigned long vmalloc_nr_pages(void) { return atomic_long_read(&nr_vmalloc_pages); } /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) { struct vmap_area *va = NULL; struct rb_node *n = vmap_area_root.rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); while (n) { struct vmap_area *tmp; tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_end > addr) { va = tmp; if (tmp->va_start <= addr) break; n = n->rb_left; } else n = n->rb_right; } return va; } static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); while (n) { struct vmap_area *va; va = rb_entry(n, struct vmap_area, rb_node); if (addr < va->va_start) n = n->rb_left; else if (addr >= va->va_end) n = n->rb_right; else return va; } return NULL; } /* * This function returns back addresses of parent node * and its left or right link for further processing. * * Otherwise NULL is returned. In that case all further * steps regarding inserting of conflicting overlap range * have to be declined and actually considered as a bug. */ static __always_inline struct rb_node ** find_va_links(struct vmap_area *va, struct rb_root *root, struct rb_node *from, struct rb_node **parent) { struct vmap_area *tmp_va; struct rb_node **link; if (root) { link = &root->rb_node; if (unlikely(!*link)) { *parent = NULL; return link; } } else { link = &from; } /* * Go to the bottom of the tree. When we hit the last point * we end up with parent rb_node and correct direction, i name * it link, where the new va->rb_node will be attached to. */ do { tmp_va = rb_entry(*link, struct vmap_area, rb_node); /* * During the traversal we also do some sanity check. * Trigger the BUG() if there are sides(left/right) * or full overlaps. */ if (va->va_end <= tmp_va->va_start) link = &(*link)->rb_left; else if (va->va_start >= tmp_va->va_end) link = &(*link)->rb_right; else { WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end); return NULL; } } while (*link); *parent = &tmp_va->rb_node; return link; } static __always_inline struct list_head * get_va_next_sibling(struct rb_node *parent, struct rb_node **link) { struct list_head *list; if (unlikely(!parent)) /* * The red-black tree where we try to find VA neighbors * before merging or inserting is empty, i.e. it means * there is no free vmap space. Normally it does not * happen but we handle this case anyway. */ return NULL; list = &rb_entry(parent, struct vmap_area, rb_node)->list; return (&parent->rb_right == link ? list->next : list); } static __always_inline void __link_va(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head, bool augment) { /* * VA is still not in the list, but we can * identify its future previous list_head node. */ if (likely(parent)) { head = &rb_entry(parent, struct vmap_area, rb_node)->list; if (&parent->rb_right != link) head = head->prev; } /* Insert to the rb-tree */ rb_link_node(&va->rb_node, parent, link); if (augment) { /* * Some explanation here. Just perform simple insertion * to the tree. We do not set va->subtree_max_size to * its current size before calling rb_insert_augmented(). * It is because we populate the tree from the bottom * to parent levels when the node _is_ in the tree. * * Therefore we set subtree_max_size to zero after insertion, * to let __augment_tree_propagate_from() puts everything to * the correct order later on. */ rb_insert_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); va->subtree_max_size = 0; } else { rb_insert_color(&va->rb_node, root); } /* Address-sort this list */ list_add(&va->list, head); } static __always_inline void link_va(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head) { __link_va(va, root, parent, link, head, false); } static __always_inline void link_va_augment(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head) { __link_va(va, root, parent, link, head, true); } static __always_inline void __unlink_va(struct vmap_area *va, struct rb_root *root, bool augment) { if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) return; if (augment) rb_erase_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); else rb_erase(&va->rb_node, root); list_del_init(&va->list); RB_CLEAR_NODE(&va->rb_node); } static __always_inline void unlink_va(struct vmap_area *va, struct rb_root *root) { __unlink_va(va, root, false); } static __always_inline void unlink_va_augment(struct vmap_area *va, struct rb_root *root) { __unlink_va(va, root, true); } #if DEBUG_AUGMENT_PROPAGATE_CHECK /* * Gets called when remove the node and rotate. */ static __always_inline unsigned long compute_subtree_max_size(struct vmap_area *va) { return max3(va_size(va), get_subtree_max_size(va->rb_node.rb_left), get_subtree_max_size(va->rb_node.rb_right)); } static void augment_tree_propagate_check(void) { struct vmap_area *va; unsigned long computed_size; list_for_each_entry(va, &free_vmap_area_list, list) { computed_size = compute_subtree_max_size(va); if (computed_size != va->subtree_max_size) pr_emerg("tree is corrupted: %lu, %lu\n", va_size(va), va->subtree_max_size); } } #endif /* * This function populates subtree_max_size from bottom to upper * levels starting from VA point. The propagation must be done * when VA size is modified by changing its va_start/va_end. Or * in case of newly inserting of VA to the tree. * * It means that __augment_tree_propagate_from() must be called: * - After VA has been inserted to the tree(free path); * - After VA has been shrunk(allocation path); * - After VA has been increased(merging path). * * Please note that, it does not mean that upper parent nodes * and their subtree_max_size are recalculated all the time up * to the root node. * * 4--8 * /\ * / \ * / \ * 2--2 8--8 * * For example if we modify the node 4, shrinking it to 2, then * no any modification is required. If we shrink the node 2 to 1 * its subtree_max_size is updated only, and set to 1. If we shrink * the node 8 to 6, then its subtree_max_size is set to 6 and parent * node becomes 4--6. */ static __always_inline void augment_tree_propagate_from(struct vmap_area *va) { /* * Populate the tree from bottom towards the root until * the calculated maximum available size of checked node * is equal to its current one. */ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); #if DEBUG_AUGMENT_PROPAGATE_CHECK augment_tree_propagate_check(); #endif } static void insert_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { struct rb_node **link; struct rb_node *parent; link = find_va_links(va, root, NULL, &parent); if (link) link_va(va, root, parent, link, head); } static void insert_vmap_area_augment(struct vmap_area *va, struct rb_node *from, struct rb_root *root, struct list_head *head) { struct rb_node **link; struct rb_node *parent; if (from) link = find_va_links(va, NULL, from, &parent); else link = find_va_links(va, root, NULL, &parent); if (link) { link_va_augment(va, root, parent, link, head); augment_tree_propagate_from(va); } } /* * Merge de-allocated chunk of VA memory with previous * and next free blocks. If coalesce is not done a new * free area is inserted. If VA has been merged, it is * freed. * * Please note, it can return NULL in case of overlap * ranges, followed by WARN() report. Despite it is a * buggy behaviour, a system can be alive and keep * ongoing. */ static __always_inline struct vmap_area * __merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head, bool augment) { struct vmap_area *sibling; struct list_head *next; struct rb_node **link; struct rb_node *parent; bool merged = false; /* * Find a place in the tree where VA potentially will be * inserted, unless it is merged with its sibling/siblings. */ link = find_va_links(va, root, NULL, &parent); if (!link) return NULL; /* * Get next node of VA to check if merging can be done. */ next = get_va_next_sibling(parent, link); if (unlikely(next == NULL)) goto insert; /* * start end * | | * |<------VA------>|<-----Next----->| * | | * start end */ if (next != head) { sibling = list_entry(next, struct vmap_area, list); if (sibling->va_start == va->va_end) { sibling->va_start = va->va_start; /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); /* Point to the new merged area. */ va = sibling; merged = true; } } /* * start end * | | * |<-----Prev----->|<------VA------>| * | | * start end */ if (next->prev != head) { sibling = list_entry(next->prev, struct vmap_area, list); if (sibling->va_end == va->va_start) { /* * If both neighbors are coalesced, it is important * to unlink the "next" node first, followed by merging * with "previous" one. Otherwise the tree might not be * fully populated if a sibling's augmented value is * "normalized" because of rotation operations. */ if (merged) __unlink_va(va, root, augment); sibling->va_end = va->va_end; /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); /* Point to the new merged area. */ va = sibling; merged = true; } } insert: if (!merged) __link_va(va, root, parent, link, head, augment); return va; } static __always_inline struct vmap_area * merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { return __merge_or_add_vmap_area(va, root, head, false); } static __always_inline struct vmap_area * merge_or_add_vmap_area_augment(struct vmap_area *va, struct rb_root *root, struct list_head *head) { va = __merge_or_add_vmap_area(va, root, head, true); if (va) augment_tree_propagate_from(va); return va; } static __always_inline bool is_within_this_va(struct vmap_area *va, unsigned long size, unsigned long align, unsigned long vstart) { unsigned long nva_start_addr; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Can be overflowed due to big size or alignment. */ if (nva_start_addr + size < nva_start_addr || nva_start_addr < vstart) return false; return (nva_start_addr + size <= va->va_end); } /* * Find the first free block(lowest start address) in the tree, * that will accomplish the request corresponding to passing * parameters. Please note, with an alignment bigger than PAGE_SIZE, * a search length is adjusted to account for worst case alignment * overhead. */ static __always_inline struct vmap_area * find_vmap_lowest_match(struct rb_root *root, unsigned long size, unsigned long align, unsigned long vstart, bool adjust_search_size) { struct vmap_area *va; struct rb_node *node; unsigned long length; /* Start from the root. */ node = root->rb_node; /* Adjust the search size for alignment overhead. */ length = adjust_search_size ? size + align - 1 : size; while (node) { va = rb_entry(node, struct vmap_area, rb_node); if (get_subtree_max_size(node->rb_left) >= length && vstart < va->va_start) { node = node->rb_left; } else { if (is_within_this_va(va, size, align, vstart)) return va; /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is * equal or bigger to the requested search length. */ if (get_subtree_max_size(node->rb_right) >= length) { node = node->rb_right; continue; } /* * OK. We roll back and find the first right sub-tree, * that will satisfy the search criteria. It can happen * due to "vstart" restriction or an alignment overhead * that is bigger then PAGE_SIZE. */ while ((node = rb_parent(node))) { va = rb_entry(node, struct vmap_area, rb_node); if (is_within_this_va(va, size, align, vstart)) return va; if (get_subtree_max_size(node->rb_right) >= length && vstart <= va->va_start) { /* * Shift the vstart forward. Please note, we update it with * parent's start address adding "1" because we do not want * to enter same sub-tree after it has already been checked * and no suitable free block found there. */ vstart = va->va_start + 1; node = node->rb_right; break; } } } } return NULL; } #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK #include <linux/random.h> static struct vmap_area * find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart) { struct vmap_area *va; list_for_each_entry(va, head, list) { if (!is_within_this_va(va, size, align, vstart)) continue; return va; } return NULL; } static void find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; unsigned int rnd; get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; va_1 = find_vmap_lowest_match(root, size, align, vstart, false); va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", va_1, va_2, vstart); } #endif enum fit_type { NOTHING_FIT = 0, FL_FIT_TYPE = 1, /* full fit */ LE_FIT_TYPE = 2, /* left edge fit */ RE_FIT_TYPE = 3, /* right edge fit */ NE_FIT_TYPE = 4 /* no edge fit */ }; static __always_inline enum fit_type classify_va_fit_type(struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { enum fit_type type; /* Check if it is within VA. */ if (nva_start_addr < va->va_start || nva_start_addr + size > va->va_end) return NOTHING_FIT; /* Now classify. */ if (va->va_start == nva_start_addr) { if (va->va_end == nva_start_addr + size) type = FL_FIT_TYPE; else type = LE_FIT_TYPE; } else if (va->va_end == nva_start_addr + size) { type = RE_FIT_TYPE; } else { type = NE_FIT_TYPE; } return type; } static __always_inline int adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); if (type == FL_FIT_TYPE) { /* * No need to split VA, it fully fits. * * | | * V NVA V * |---------------| */ unlink_va_augment(va, root); kmem_cache_free(vmap_area_cachep, va); } else if (type == LE_FIT_TYPE) { /* * Split left edge of fit VA. * * | | * V NVA V R * |-------|-------| */ va->va_start += size; } else if (type == RE_FIT_TYPE) { /* * Split right edge of fit VA. * * | | * L V NVA V * |-------|-------| */ va->va_end = nva_start_addr; } else if (type == NE_FIT_TYPE) { /* * Split no edge of fit VA. * * | | * L V NVA V R * |---|-------|---| */ lva = __this_cpu_xchg(ne_fit_preload_node, NULL); if (unlikely(!lva)) { /* * For percpu allocator we do not do any pre-allocation * and leave it as it is. The reason is it most likely * never ends up with NE_FIT_TYPE splitting. In case of * percpu allocations offsets and sizes are aligned to * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE * are its main fitting cases. * * There are a few exceptions though, as an example it is * a first allocation (early boot up) when we have "one" * big free space that has to be split. * * Also we can hit this path in case of regular "vmap" * allocations, if "this" current CPU was not preloaded. * See the comment in alloc_vmap_area() why. If so, then * GFP_NOWAIT is used instead to get an extra object for * split purpose. That is rare and most time does not * occur. * * What happens if an allocation gets failed. Basically, * an "overflow" path is triggered to purge lazily freed * areas to free some memory, then, the "retry" path is * triggered to repeat one more time. See more details * in alloc_vmap_area() function. */ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) return -1; } /* * Build the remainder. */ lva->va_start = va->va_start; lva->va_end = nva_start_addr; /* * Shrink this VA to remaining size. */ va->va_start = nva_start_addr + size; } else { return -1; } if (type != FL_FIT_TYPE) { augment_tree_propagate_from(va); if (lva) /* type == NE_FIT_TYPE */ insert_vmap_area_augment(lva, &va->rb_node, root, head); } return 0; } /* * Returns a start address of the newly allocated area, if success. * Otherwise a vend is returned that indicates failure. */ static __always_inline unsigned long __alloc_vmap_area(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; int ret; /* * Do not adjust when: * a) align <= PAGE_SIZE, because it does not make any sense. * All blocks(their start addresses) are at least PAGE_SIZE * aligned anyway; * b) a short range where a requested size corresponds to exactly * specified [vstart:vend] interval and an alignment > PAGE_SIZE. * With adjusted search length an allocation would not succeed. */ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) adjust_search_size = false; va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) return vend; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Check the "vend" restriction. */ if (nva_start_addr + size > vend) return vend; /* Update the free vmap_area. */ ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; } /* * Free a region of KVA allocated by alloc_vmap_area */ static void free_vmap_area(struct vmap_area *va) { /* * Remove from the busy tree/list. */ spin_lock(&vmap_area_lock); unlink_va(va, &vmap_area_root); spin_unlock(&vmap_area_lock); /* * Insert/Merge it back to the free tree/list. */ spin_lock(&free_vmap_area_lock); merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } static inline void preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) { struct vmap_area *va = NULL; /* * Preload this CPU with one extra vmap_area object. It is used * when fit type of free area is NE_FIT_TYPE. It guarantees that * a CPU that does an allocation is preloaded. * * We do it in non-atomic context, thus it allows us to use more * permissive allocation masks to be more stable under low memory * condition and high memory pressure. */ if (!this_cpu_read(ne_fit_preload_node)) va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); spin_lock(lock); if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) kmem_cache_free(vmap_area_cachep, va); } /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. */ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask, unsigned long va_flags) { struct vmap_area *va; unsigned long freed; unsigned long addr; int purged = 0; int ret; if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) return ERR_PTR(-EINVAL); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); might_sleep(); gfp_mask = gfp_mask & GFP_RECLAIM_MASK; va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); /* * Only scan the relevant parts containing pointers to other objects * to avoid false negatives. */ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->vm = NULL; va->flags = va_flags; spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); ret = kasan_populate_vmalloc(addr, size); if (ret) { free_vmap_area(va); return ERR_PTR(ret); } return va; overflow: if (!purged) { reclaim_and_purge_vmap_areas(); purged = 1; goto retry; } freed = 0; blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); if (freed > 0) { purged = 0; goto retry; } if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", size); kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } int register_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); int unregister_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * * There is a tradeoff here: a larger number will cover more kernel page tables * and take slightly longer to purge, but it will linearly reduce the number of * global TLB flushes that must be performed. It would seem natural to scale * this number up linearly with the number of CPUs (because vmapping activity * could also scale linearly with the number of CPUs), however it is likely * that in practice, workloads might be constrained in other ways that mean * vmap activity will not scale linearly with CPUs. Also, I want to be * conservative and not introduce a big latency on huge systems, so go with * a less aggressive log scale. It will still be an improvement over the old * code, and it will be simple to change the scale factor if we find that it * becomes a problem on bigger systems. */ static unsigned long lazy_max_pages(void) { unsigned int log; log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); } static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. There is no actual critical section protected * by this lock, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic. */ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); /* * Purges all lazily-freed vmap areas. */ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { unsigned long resched_threshold; unsigned int num_purged_areas = 0; struct list_head local_purge_list; struct vmap_area *va, *n_va; lockdep_assert_held(&vmap_purge_lock); spin_lock(&purge_vmap_area_lock); purge_vmap_area_root = RB_ROOT; list_replace_init(&purge_vmap_area_list, &local_purge_list); spin_unlock(&purge_vmap_area_lock); if (unlikely(list_empty(&local_purge_list))) goto out; start = min(start, list_first_entry(&local_purge_list, struct vmap_area, list)->va_start); end = max(end, list_last_entry(&local_purge_list, struct vmap_area, list)->va_end); flush_tlb_kernel_range(start, end); resched_threshold = lazy_max_pages() << 1; spin_lock(&free_vmap_area_lock); list_for_each_entry_safe(va, n_va, &local_purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; /* * Finally insert or merge lazily-freed area. It is * detached and there is no need to "unlink" it from * anything. */ va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); if (!va) continue; if (is_vmalloc_or_module_addr((void *)orig_start)) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); num_purged_areas++; if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) cond_resched_lock(&free_vmap_area_lock); } spin_unlock(&free_vmap_area_lock); out: trace_purge_vmap_area_lazy(start, end, num_purged_areas); return num_purged_areas > 0; } /* * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list. */ static void reclaim_and_purge_vmap_areas(void) { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); __purge_vmap_area_lazy(ULONG_MAX, 0); mutex_unlock(&vmap_purge_lock); } static void drain_vmap_area_work(struct work_struct *work) { unsigned long nr_lazy; do { mutex_lock(&vmap_purge_lock); __purge_vmap_area_lazy(ULONG_MAX, 0); mutex_unlock(&vmap_purge_lock); /* Recheck if further work is required. */ nr_lazy = atomic_long_read(&vmap_lazy_nr); } while (nr_lazy > lazy_max_pages()); } /* * Free a vmap area, caller ensuring that the area has been unmapped, * unlinked and flush_cache_vunmap had been called for the correct * range previously. */ static void free_vmap_area_noflush(struct vmap_area *va) { unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; unsigned long nr_lazy; if (WARN_ON_ONCE(!list_empty(&va->list))) return; nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); /* * Merge or place it to the purge tree/list. */ spin_lock(&purge_vmap_area_lock); merge_or_add_vmap_area(va, &purge_vmap_area_root, &purge_vmap_area_list); spin_unlock(&purge_vmap_area_lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); /* After this point, we may free va at any time */ if (unlikely(nr_lazy > nr_lazy_max)) schedule_work(&drain_vmap_work); } /* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); free_vmap_area_noflush(va); } struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_area *va; spin_lock(&vmap_area_lock); va = __find_vmap_area(addr, &vmap_area_root); spin_unlock(&vmap_area_lock); return va; } static struct vmap_area *find_unlink_vmap_area(unsigned long addr) { struct vmap_area *va; spin_lock(&vmap_area_lock); va = __find_vmap_area(addr, &vmap_area_root); if (va) unlink_va(va, &vmap_area_root); spin_unlock(&vmap_area_lock); return va; } /*** Per cpu kva allocator ***/ /* * vmap space is limited especially on 32 bit architectures. Ensure there is * room for at least 16 percpu vmap blocks per CPU. */ /* * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess * instead (we just need a rough idea) */ #if BITS_PER_LONG == 32 #define VMALLOC_SPACE (128UL*1024*1024) #else #define VMALLOC_SPACE (128UL*1024*1024*1024) #endif #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ #define VMAP_BBMAP_BITS \ VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) /* * Purge threshold to prevent overeager purging of fragmented blocks for * regular operations: Purge if vb->free is less than 1/4 of the capacity. */ #define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4) #define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ #define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ #define VMAP_FLAGS_MASK 0x3 struct vmap_block_queue { spinlock_t lock; struct list_head free; /* * An xarray requires an extra memory dynamically to * be allocated. If it is an issue, we can use rb-tree * instead. */ struct xarray vmap_blocks; }; struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS); unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; struct list_head purge; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); /* * In order to fast access to any "vmap_block" associated with a * specific address, we use a hash. * * A per-cpu vmap_block_queue is used in both ways, to serialize * an access to free block chains among CPUs(alloc path) and it * also acts as a vmap_block hash(alloc/free paths). It means we * overload it, since we already have the per-cpu array which is * used as a hash table. When used as a hash a 'cpu' passed to * per_cpu() is not actually a CPU but rather a hash index. * * A hash function is addr_to_vb_xa() which hashes any address * to a specific index(in a hash) it belongs to. This then uses a * per_cpu() macro to access an array with generated index. * * An example: * * CPU_1 CPU_2 CPU_0 * | | | * V V V * 0 10 20 30 40 50 60 * |------|------|------|------|------|------|...<vmap address space> * CPU0 CPU1 CPU2 CPU0 CPU1 CPU2 * * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus * it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock; * * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus * it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock; * * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus * it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock. * * This technique almost always avoids lock contention on insert/remove, * however xarray spinlocks protect against any contention that remains. */ static struct xarray * addr_to_vb_xa(unsigned long addr) { int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus(); return &per_cpu(vmap_block_queue, index).vmap_blocks; } /* * We should probably have a fallback mechanism to allocate virtual memory * out of partially filled vmap blocks. However vmap block sizing should be * fairly reasonable according to the vmalloc size, so it shouldn't be a * big problem. */ static unsigned long addr_to_vb_idx(unsigned long addr) { addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); addr /= VMAP_BLOCK_SIZE; return addr; } static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) { unsigned long addr; addr = va_start + (pages_off << PAGE_SHIFT); BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); return (void *)addr; } /** * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this * block. Of course pages number can't exceed VMAP_BBMAP_BITS * @order: how many 2^order pages should be occupied in newly allocated block * @gfp_mask: flags for the page level allocator * * Return: virtual address in a newly allocated block or ERR_PTR(-errno) */ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; struct vmap_area *va; struct xarray *xa; unsigned long vb_idx; int node, err; void *vaddr; node = numa_node_id(); vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!vb)) return ERR_PTR(-ENOMEM); va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask, VMAP_RAM|VMAP_BLOCK); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } vaddr = vmap_block_vaddr(va->va_start, 0); spin_lock_init(&vb->lock); vb->va = va; /* At least something should be left free */ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); bitmap_zero(vb->used_map, VMAP_BBMAP_BITS); vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); xa = addr_to_vb_xa(va->va_start); vb_idx = addr_to_vb_idx(va->va_start); err = xa_insert(xa, vb_idx, vb, gfp_mask); if (err) { kfree(vb); free_vmap_area(va); return ERR_PTR(err); } vbq = raw_cpu_ptr(&vmap_block_queue); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); return vaddr; } static void free_vmap_block(struct vmap_block *vb) { struct vmap_block *tmp; struct xarray *xa; xa = addr_to_vb_xa(vb->va->va_start); tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); spin_lock(&vmap_area_lock); unlink_va(vb->va, &vmap_area_root); spin_unlock(&vmap_area_lock); free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); } static bool purge_fragmented_block(struct vmap_block *vb, struct vmap_block_queue *vbq, struct list_head *purge_list, bool force_purge) { if (vb->free + vb->dirty != VMAP_BBMAP_BITS || vb->dirty == VMAP_BBMAP_BITS) return false; /* Don't overeagerly purge usable blocks unless requested */ if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD)) return false; /* prevent further allocs after releasing lock */ WRITE_ONCE(vb->free, 0); /* prevent purging it again */ WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS); vb->dirty_min = 0; vb->dirty_max = VMAP_BBMAP_BITS; spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); list_add_tail(&vb->purge, purge_list); return true; } static void free_purged_blocks(struct list_head *purge_list) { struct vmap_block *vb, *n_vb; list_for_each_entry_safe(vb, n_vb, purge_list, purge) { list_del(&vb->purge); free_vmap_block(vb); } } static void purge_fragmented_blocks(int cpu) { LIST_HEAD(purge); struct vmap_block *vb; struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long free = READ_ONCE(vb->free); unsigned long dirty = READ_ONCE(vb->dirty); if (free + dirty != VMAP_BBMAP_BITS || dirty == VMAP_BBMAP_BITS) continue; spin_lock(&vb->lock); purge_fragmented_block(vb, vbq, &purge, true); spin_unlock(&vb->lock); } rcu_read_unlock(); free_purged_blocks(&purge); } static void purge_fragmented_blocks_allcpus(void) { int cpu; for_each_possible_cpu(cpu) purge_fragmented_blocks(cpu); } static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; void *vaddr = NULL; unsigned int order; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); if (WARN_ON(size == 0)) { /* * Allocating 0 bytes isn't what caller wants since * get_order(0) returns funny result. Just warn and terminate * early. */ return NULL; } order = get_order(size); rcu_read_lock(); vbq = raw_cpu_ptr(&vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; if (READ_ONCE(vb->free) < (1UL << order)) continue; spin_lock(&vb->lock); if (vb->free < (1UL << order)) { spin_unlock(&vb->lock); continue; } pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); WRITE_ONCE(vb->free, vb->free - (1UL << order)); bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); } spin_unlock(&vb->lock); break; } rcu_read_unlock(); /* Allocate new block if nothing was found */ if (!vaddr) vaddr = new_vmap_block(order, gfp_mask); return vaddr; } static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned int order; struct vmap_block *vb; struct xarray *xa; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap(addr, addr + size); order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; xa = addr_to_vb_xa(addr); vb = xa_load(xa, addr_to_vb_idx(addr)); spin_lock(&vb->lock); bitmap_clear(vb->used_map, offset, (1UL << order)); spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); /* Expand the not yet TLB flushed dirty range */ vb->dirty_min = min(vb->dirty_min, offset); vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order)); if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else spin_unlock(&vb->lock); } static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { LIST_HEAD(purge_list); int cpu; if (unlikely(!vmap_initialized)) return; mutex_lock(&vmap_purge_lock); for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; unsigned long idx; rcu_read_lock(); xa_for_each(&vbq->vmap_blocks, idx, vb) { spin_lock(&vb->lock); /* * Try to purge a fragmented block first. If it's * not purgeable, check whether there is dirty * space to be flushed. */ if (!purge_fragmented_block(vb, vbq, &purge_list, false) && vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; s = va_start + (vb->dirty_min << PAGE_SHIFT); e = va_start + (vb->dirty_max << PAGE_SHIFT); start = min(s, start); end = max(e, end); /* Prevent that this is flushed again */ vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; flush = 1; } spin_unlock(&vb->lock); } rcu_read_unlock(); } free_purged_blocks(&purge_list); if (!__purge_vmap_area_lazy(start, end) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } /** * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer * * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily * to amortize TLB flushing overheads. What this means is that any page you * have now, may, in a former life, have been mapped into kernel virtual * address by the vmap layer and so there might be some CPUs with TLB entries * still referencing that page (additional to the regular 1:1 kernel mapping). * * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can * be sure that none of the pages we have control over will have any aliases * from the vmap layer. */ void vm_unmap_aliases(void) { unsigned long start = ULONG_MAX, end = 0; int flush = 0; _vm_unmap_aliases(start, end, flush); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); /** * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram * @mem: the pointer returned by vm_map_ram * @count: the count passed to that vm_map_ram call (cannot unmap partial) */ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); kasan_poison_vmalloc(mem, size); if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); vb_free(addr, size); return; } va = find_unlink_vmap_area(addr); if (WARN_ON_ONCE(!va)) return; debug_check_no_locks_freed((void *)va->va_start, (va->va_end - va->va_start)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); /** * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life * objects with vm_map_ram(), it could consume lots of address space through * fragmentation (especially on a 32bit machine). You could see failures in * the end. Please use this function for short-lived objects. * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; void *mem; if (likely(count <= VMAP_MAX_ALLOC)) { mem = vb_alloc(size, GFP_KERNEL); if (IS_ERR(mem)) return NULL; addr = (unsigned long)mem; } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END, node, GFP_KERNEL, VMAP_RAM); if (IS_ERR(va)) return NULL; addr = va->va_start; mem = (void *)addr; } if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } /* * Mark the pages as accessible, now that they are mapped. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); return mem; } EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; static inline unsigned int vm_area_page_order(struct vm_struct *vm) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC return vm->page_order; #else return 0; #endif } static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC vm->page_order = order; #else BUG_ON(order != 0); #endif } /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add * * This function is used to add fixed kernel vm area to vmlist before * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags * should contain proper values and the other fields should be zero. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_add_early(struct vm_struct *vm) { struct vm_struct *tmp, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) { BUG_ON(tmp->addr < vm->addr + vm->size); break; } else BUG_ON(tmp->addr + tmp->size > vm->addr); } vm->next = *p; *p = vm; } /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain * proper values on entry and other fields should be zero. On return, * vm->addr contains the allocated address. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_register_early(struct vm_struct *vm, size_t align) { unsigned long addr = ALIGN(VMALLOC_START, align); struct vm_struct *cur, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { if ((unsigned long)cur->addr - addr >= vm->size) break; addr = ALIGN((unsigned long)cur->addr + cur->size, align); } BUG_ON(addr > VMALLOC_END - vm->size); vm->addr = (void *)addr; vm->next = *p; *p = vm; kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } static void vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; struct vmap_area *busy, *free; /* * B F B B B F * -|-----|.....|-----|-----|-----|.....|- * | The KVA space | * |<--------------------------------->| */ list_for_each_entry(busy, &vmap_area_list, list) { if (busy->va_start - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = busy->va_start; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } vmap_start = busy->va_end; } if (vmap_end - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = vmap_end; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } } static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; } static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { spin_lock(&vmap_area_lock); setup_vmalloc_vm_locked(vm, va, flags, caller); spin_unlock(&vmap_area_lock); } static void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. * Pair with smp_rmb() in show_numa_info(). */ smp_wmb(); vm->flags &= ~VM_UNINITIALIZED; } static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; unsigned long requested_size = size; BUG_ON(in_interrupt()); size = ALIGN(size, 1ul << shift); if (unlikely(!size)) return NULL; if (flags & VM_IOREMAP) align = 1ul << clamp_t(int, get_count_order_long(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0); if (IS_ERR(va)) { kfree(area); return NULL; } setup_vmalloc_vm(area, va, flags, caller); /* * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a * best-effort approach, as they can be mapped outside of vmalloc code. * For VM_ALLOC mappings, the pages are marked as accessible after * getting mapped in __vmalloc_node_range(). * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ if (!(flags & VM_ALLOC)) area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, KASAN_VMALLOC_PROT_NORMAL); return area; } struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, NUMA_NO_NODE, GFP_KERNEL, caller); } /** * get_vm_area - reserve a contiguous kernel virtual area * @size: size of the area * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC * * Search an area of @size in the kernel virtual mapping area, * and reserved it for out purposes. Returns the area descriptor * on success or %NULL on failure. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); } /** * find_vm_area - find a continuous kernel virtual area * @addr: base address * * Search for the kernel VM area starting at @addr, and return it. * It is up to the caller to do all required locking to keep the returned * pointer valid. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; va = find_vmap_area((unsigned long)addr); if (!va) return NULL; return va->vm; } /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address * * Search for the kernel VM area starting at @addr, and remove it. * This function returns the found VM area, but using it is NOT safe * on SMP machines, except for its size or flags. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; struct vm_struct *vm; might_sleep(); if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", addr)) return NULL; va = find_unlink_vmap_area((unsigned long)addr); if (!va || !va->vm) return NULL; vm = va->vm; debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); kasan_free_module_shadow(vm); kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); free_unmap_vmap_area(va); return vm; } static inline void set_area_direct_map(const struct vm_struct *area, int (*set_direct_map)(struct page *page)) { int i; /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); } /* * Flush the vm mapping and reset the direct map. */ static void vm_reset_perms(struct vm_struct *area) { unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_dmap = 0; int i; /* * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { unsigned long page_size; page_size = PAGE_SIZE << page_order; start = min(addr, start); end = max(addr + page_size, end); flush_dmap = 1; } } /* * Set direct map to something invalid so that it won't be cached if * there are any accesses after the TLB flush, then flush the TLB and * reset the direct map permissions to the default. */ set_area_direct_map(area, set_direct_map_invalid_noflush); _vm_unmap_aliases(start, end, flush_dmap); set_area_direct_map(area, set_direct_map_default_noflush); } static void delayed_vfree_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); struct llist_node *t, *llnode; llist_for_each_safe(llnode, t, llist_del_all(&p->list)) vfree(llnode); } /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address * * This one is just like vfree() but can be called in any atomic context * except NMIs. */ void vfree_atomic(const void *addr) { struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); BUG_ON(in_nmi()); kmemleak_free(addr); /* * Use raw_cpu_ptr() because this can be called from preemptible * context. Preemption is absolutely fine here, because the llist_add() * implementation is lockless, so it works even if we are adding to * another cpu's list. schedule_work() should be fine with this too. */ if (addr && llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); } /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address * * Free the virtually continuous memory area starting at @addr, as obtained * from one of the vmalloc() family of APIs. This will usually also free the * physical memory underlying the virtual allocation, but that memory is * reference counted, so it will not be freed until the last user goes away. * * If @addr is NULL, no operation is performed. * * Context: * May sleep if called *not* from interrupt context. * Must not be called in NMI context (strictly speaking, it could be * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-dependent would be a really bad idea). */ void vfree(const void *addr) { struct vm_struct *vm; int i; if (unlikely(in_interrupt())) { vfree_atomic(addr); return; } BUG_ON(in_nmi()); kmemleak_free(addr); might_sleep(); if (!addr) return; vm = remove_vm_area(addr); if (unlikely(!vm)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; BUG_ON(!page); mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations */ __free_page(page); cond_resched(); } atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } EXPORT_SYMBOL(vfree); /** * vunmap - release virtual mapping obtained by vmap() * @addr: memory base address * * Free the virtually contiguous memory area starting at @addr, * which was created from the page array passed to vmap(). * * Must not be called in interrupt context. */ void vunmap(const void *addr) { struct vm_struct *vm; BUG_ON(in_interrupt()); might_sleep(); if (!addr) return; vm = remove_vm_area(addr); if (unlikely(!vm)) { WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", addr); return; } kfree(vm); } EXPORT_SYMBOL(vunmap); /** * vmap - map an array of pages into virtually contiguous space * @pages: array of page pointers * @count: number of pages to map * @flags: vm_area->flags * @prot: page protection for the mapping * * Maps @count pages from @pages into contiguous kernel virtual space. * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself * (which must be kmalloc or vmalloc memory) and one reference per pages in it * are transferred from the caller to vmap(), and will be freed / dropped when * vfree() is called on the return value. * * Return: the address of the area or %NULL on failure */ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; unsigned long addr; unsigned long size; /* In bytes */ might_sleep(); if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) return NULL; /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. */ if (WARN_ON_ONCE(flags & VM_NO_GUARD)) flags &= ~VM_NO_GUARD; if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; area = get_vm_area_caller(size, flags, __builtin_return_address(0)); if (!area) return NULL; addr = (unsigned long)area->addr; if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), pages, PAGE_SHIFT) < 0) { vunmap(area->addr); return NULL; } if (flags & VM_MAP_PUT_PAGES) { area->pages = pages; area->nr_pages = count; } return area->addr; } EXPORT_SYMBOL(vmap); #ifdef CONFIG_VMAP_PFN struct vmap_pfn_data { unsigned long *pfns; pgprot_t prot; unsigned int idx; }; static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) { struct vmap_pfn_data *data = private; unsigned long pfn = data->pfns[data->idx]; pte_t ptent; if (WARN_ON_ONCE(pfn_valid(pfn))) return -EINVAL; ptent = pte_mkspecial(pfn_pte(pfn, data->prot)); set_pte_at(&init_mm, addr, pte, ptent); data->idx++; return 0; } /** * vmap_pfn - map an array of PFNs into virtually contiguous space * @pfns: array of PFNs * @count: number of pages to map * @prot: page protection for the mapping * * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns * the start address of the mapping. */ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) { struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; struct vm_struct *area; area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, __builtin_return_address(0)); if (!area) return NULL; if (apply_to_page_range(&init_mm, (unsigned long)area->addr, count * PAGE_SIZE, vmap_pfn_apply, &data)) { free_vm_area(area); return NULL; } flush_cache_vmap((unsigned long)area->addr, (unsigned long)area->addr + count * PAGE_SIZE); return area->addr; } EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; gfp_t alloc_gfp = gfp; bool nofail = false; struct page *page; int i; /* * For order-0 pages we make use of bulk allocator, if * the page array is partly or not at all populated due * to fails, fallback to a single page allocator that is * more permissive. */ if (!order) { /* bulk allocator doesn't support nofail req. officially */ gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; /* * A maximum allowed request is hard-coded and is 100 * pages per call. That is done in order to prevent a * long preemption off scenario in the bulk-allocator * so the range is [1:100]. */ nr_pages_request = min(100U, nr_pages - nr_allocated); /* memory allocation should consider mempolicy, we can't * wrongly use nearest node when nid == NUMA_NO_NODE, * otherwise memory may be allocated in only one node, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, nr_pages_request, pages + nr_allocated); else nr = alloc_pages_bulk_array_node(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); nr_allocated += nr; cond_resched(); /* * If zero or pages were obtained partly, * fallback to a single page allocator. */ if (nr != nr_pages_request) break; } } else if (gfp & __GFP_NOFAIL) { /* * Higher order nofail allocations are really expensive and * potentially dangerous (pre-mature OOM, disruptive reclaim * and compaction etc. */ alloc_gfp &= ~__GFP_NOFAIL; nofail = true; } /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { if (fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) page = alloc_pages(alloc_gfp, order); else page = alloc_pages_node(nid, alloc_gfp, order); if (unlikely(!page)) { if (!nofail) break; /* fall back to the zero order allocations */ alloc_gfp |= __GFP_NOFAIL; order = 0; continue; } /* * Higher order allocations must be able to be treated as * indepdenent small pages by callers (as they can with * small-page vmallocs). Some drivers do their own refcounting * on vmalloc_to_page() pages, some use page->mapping, * page->lru, etc. */ if (order) split_page(page, order); /* * Careful, we allocate and map page-order pages, but * tracking is done per PAGE_SIZE page so as to keep the * vm_struct APIs independent of the physical/mapped size. */ for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; cond_resched(); nr_allocated += 1U << order; } return nr_allocated; } static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; unsigned int flags; int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, area->caller); } else { area->pages = kmalloc_node(array_size, nested_gfp, node); } if (!area->pages) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); return NULL; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (gfp_mask & __GFP_ACCOUNT) { int i; for (i = 0; i < area->nr_pages; i++) mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); } /* * If not enough pages were obtained to accomplish an * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { /* * vm_area_alloc_pages() can fail due to insufficient memory but * also:- * * - a pending fatal signal * - insufficient huge page-order pages * * Since we always retry allocations at order-0 in the huge page * case a warning for either is spurious. */ if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocate pages", area->nr_pages * PAGE_SIZE); goto fail; } /* * page tables allocations ignore external gfp mask, enforce it * by the scope API */ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) flags = memalloc_nofs_save(); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) flags = memalloc_noio_save(); do { ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) memalloc_nofs_restore(flags); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) memalloc_noio_restore(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; } return area->addr; fail: vfree(area->addr); return NULL; } /** * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @start: vm area range start * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Please note that the full set of gfp * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all * supported. * Zone modifiers are not supported. From the reclaim modifiers * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and * __GFP_RETRY_MAYFAIL are not supported). * * __GFP_NOWARN can be used to suppress failures messages. * * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. * * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { struct vm_struct *area; void *ret; kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; if (WARN_ON_ONCE(!size)) return NULL; if ((size >> PAGE_SHIFT) > totalram_pages()) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, exceeds total pages", real_size); return NULL; } if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) { unsigned long size_per_node; /* * Try huge pages. Only try for PAGE_KERNEL allocations, * others like modules don't yet expect huge pages in * their allocations due to apply_to_page_range not * supporting them. */ size_per_node = size; if (node == NUMA_NO_NODE) size_per_node /= num_online_nodes(); if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE) shift = PMD_SHIFT; else shift = arch_vmap_pte_supported_shift(size_per_node); align = max(real_align, 1UL << shift); size = ALIGN(real_size, 1UL << shift); } again: area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, vm_struct allocation failed%s", real_size, (nofail) ? ". Retrying." : ""); if (nofail) { schedule_timeout_uninterruptible(1); goto again; } goto fail; } /* * Prepare arguments for __vmalloc_area_node() and * kasan_unpoison_vmalloc(). */ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { if (kasan_hw_tags_enabled()) { /* * Modify protection bits to allow tagging. * This must be done before mapping. */ prot = arch_vmap_pgprot_tagged(prot); /* * Skip page_alloc poisoning and zeroing for physical * pages backing VM_ALLOC mapping. Memory is instead * poisoned and zeroed by kasan_unpoison_vmalloc(). */ gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO; } /* Take note that the mapping is PAGE_KERNEL. */ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; } /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; /* * Mark the pages as accessible, now that they are mapped. * The condition for setting KASAN_VMALLOC_INIT should complement the * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check * to make sure that memory is initialized under the same conditions. * Tag-based KASAN modes only assign tags to normal non-executable * allocations, see __kasan_unpoison_vmalloc(). */ kasan_flags |= KASAN_VMALLOC_VM_ALLOC; if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. */ clear_vm_uninitialized_flag(area); size = PAGE_ALIGN(size); if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, size, gfp_mask); return area->addr; fail: if (shift > PAGE_SHIFT) { shift = PAGE_SHIFT; align = real_align; size = real_size; goto again; } return NULL; } /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level allocator with * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported * * Any use of gfp flags outside of GFP_KERNEL should be consulted * with mm people. * * Return: pointer to the allocated memory or %NULL on error */ void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } /* * This is only for performance analysis of vmalloc and stress purpose. * It is required by vmalloc test module, therefore do not use it other * than that. */ #ifdef CONFIG_TEST_VMALLOC_MODULE EXPORT_SYMBOL_GPL(__vmalloc_node); #endif void *__vmalloc(unsigned long size, gfp_t gfp_mask) { return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); /** * vmalloc - allocate virtually contiguous memory * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); /** * vmalloc_huge - allocate virtually contiguous memory, allow huge pages * @size: allocation size * @gfp_mask: flags for the page level allocator * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * If @size is greater than or equal to PMD_SIZE, allow using * huge pages for the memory * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) { return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(vmalloc_huge); /** * vzalloc - allocate virtually contiguous memory with zero fill * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc); /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size * * The resulting memory area is zeroed so it can be mapped to userspace * without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_user(unsigned long size) { return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user); /** * vmalloc_node - allocate memory on a specific node * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_node(unsigned long size, int node) { return __vmalloc_node(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); /** * vzalloc_node - allocate memory on a specific node with zero fill * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node); #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) #else /* * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32(unsigned long size) { return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory * @size: allocation size * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_user(unsigned long size) { return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user); /* * Atomically zero bytes in the iterator. * * Returns the number of zeroed bytes. */ static size_t zero_iter(struct iov_iter *iter, size_t count) { size_t remains = count; while (remains > 0) { size_t num, copied; num = min_t(size_t, remains, PAGE_SIZE); copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter); remains -= copied; if (copied < num) break; } return count - remains; } /* * small helper routine, copy contents to iter from addr. * If the page is not present, fill zero. * * Returns the number of copied bytes. */ static size_t aligned_vread_iter(struct iov_iter *iter, const char *addr, size_t count) { size_t remains = count; struct page *page; while (remains > 0) { unsigned long offset, length; size_t copied = 0; offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > remains) length = remains; page = vmalloc_to_page(addr); /* * To do safe access to this _mapped_ area, we need lock. But * adding lock here means that we need to add overhead of * vmalloc()/vfree() calls for this _debug_ interface, rarely * used. Instead of that, we'll use an local mapping via * copy_page_to_iter_nofault() and accept a small overhead in * this access function. */ if (page) copied = copy_page_to_iter_nofault(page, offset, length, iter); else copied = zero_iter(iter, length); addr += copied; remains -= copied; if (copied != length) break; } return count - remains; } /* * Read from a vm_map_ram region of memory. * * Returns the number of copied bytes. */ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr, size_t count, unsigned long flags) { char *start; struct vmap_block *vb; struct xarray *xa; unsigned long offset; unsigned int rs, re; size_t remains, n; /* * If it's area created by vm_map_ram() interface directly, but * not further subdividing and delegating management to vmap_block, * handle it here. */ if (!(flags & VMAP_BLOCK)) return aligned_vread_iter(iter, addr, count); remains = count; /* * Area is split into regions and tracked with vmap_block, read out * each region and zero fill the hole between regions. */ xa = addr_to_vb_xa((unsigned long) addr); vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr)); if (!vb) goto finished_zero; spin_lock(&vb->lock); if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { spin_unlock(&vb->lock); goto finished_zero; } for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { size_t copied; if (remains == 0) goto finished; start = vmap_block_vaddr(vb->va->va_start, rs); if (addr < start) { size_t to_zero = min_t(size_t, start - addr, remains); size_t zeroed = zero_iter(iter, to_zero); addr += zeroed; remains -= zeroed; if (remains == 0 || zeroed != to_zero) goto finished; } /*it could start reading from the middle of used region*/ offset = offset_in_page(addr); n = ((re - rs + 1) << PAGE_SHIFT) - offset; if (n > remains) n = remains; copied = aligned_vread_iter(iter, start + offset, n); addr += copied; remains -= copied; if (copied != n) goto finished; } spin_unlock(&vb->lock); finished_zero: /* zero-fill the left dirty or free regions */ return count - remains + zero_iter(iter, remains); finished: /* We couldn't copy/zero everything */ spin_unlock(&vb->lock); return count - remains; } /** * vread_iter() - read vmalloc area in a safe way to an iterator. * @iter: the iterator to which data should be written. * @addr: vm address. * @count: number of bytes to be read. * * This function checks that addr is a valid vmalloc'ed area, and * copy data from that area to a given buffer. If the given memory range * of [addr...addr+count) includes some valid address, data is copied to * proper area of @buf. If there are memory holes, they'll be zero-filled. * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive * vm_struct area, returns 0. @buf should be kernel's buffer. * * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any information, as /proc/kcore. * * Return: number of bytes for which addr and buf should be increased * (same number as @count) or %0 if [addr...addr+count) doesn't * include any intersection with valid vmalloc area */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) { struct vmap_area *va; struct vm_struct *vm; char *vaddr; size_t n, size, flags, remains; addr = kasan_reset_tag(addr); /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; remains = count; spin_lock(&vmap_area_lock); va = find_vmap_area_exceed_addr((unsigned long)addr); if (!va) goto finished_zero; /* no intersects with alive vmap_area */ if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; list_for_each_entry_from(va, &vmap_area_list, list) { size_t copied; if (remains == 0) goto finished; vm = va->vm; flags = va->flags & VMAP_FLAGS_MASK; /* * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need * be set together with VMAP_RAM. */ WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) continue; if (vm && (vm->flags & VM_UNINITIALIZED)) continue; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); vaddr = (char *) va->va_start; size = vm ? get_vm_area_size(vm) : va_size(va); if (addr >= vaddr + size) continue; if (addr < vaddr) { size_t to_zero = min_t(size_t, vaddr - addr, remains); size_t zeroed = zero_iter(iter, to_zero); addr += zeroed; remains -= zeroed; if (remains == 0 || zeroed != to_zero) goto finished; } n = vaddr + size - addr; if (n > remains) n = remains; if (flags & VMAP_RAM) copied = vmap_ram_vread_iter(iter, addr, n, flags); else if (!(vm && (vm->flags & VM_IOREMAP))) copied = aligned_vread_iter(iter, addr, n); else /* IOREMAP area is treated as memory hole */ copied = zero_iter(iter, n); addr += copied; remains -= copied; if (copied != n) goto finished; } finished_zero: spin_unlock(&vmap_area_lock); /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ spin_unlock(&vmap_area_lock); return count - remains; } /** * remap_vmalloc_range_partial - map vmalloc pages to userspace * @vma: vma to cover * @uaddr: target user address to start at * @kaddr: virtual address of vmalloc kernel memory * @pgoff: offset from @kaddr to start at * @size: size of map area * * Returns: 0 for success, -Exxx on failure * * This function checks that @kaddr is a valid vmalloc'ed area, * and that it is big enough to cover the range starting at * @uaddr in @vma. Will return failure if that criteria isn't * met. * * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long pgoff, unsigned long size) { struct vm_struct *area; unsigned long off; unsigned long end_index; if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) return -EINVAL; size = PAGE_ALIGN(size); if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) return -EINVAL; area = find_vm_area(kaddr); if (!area) return -EINVAL; if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) return -EINVAL; if (check_add_overflow(size, off, &end_index) || end_index > get_vm_area_size(area)) return -EINVAL; kaddr += off; do { struct page *page = vmalloc_to_page(kaddr); int ret; ret = vm_insert_page(vma, uaddr, page); if (ret) return ret; uaddr += PAGE_SIZE; kaddr += PAGE_SIZE; size -= PAGE_SIZE; } while (size > 0); vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } /** * remap_vmalloc_range - map vmalloc pages to userspace * @vma: vma to cover (map full range of vma) * @addr: vmalloc memory * @pgoff: number of pages into addr before first page to map * * Returns: 0 for success, -Exxx on failure * * This function checks that addr is a valid vmalloc'ed area, and * that it is big enough to cover the vma. Will return failure if * that criteria isn't met. * * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) { return remap_vmalloc_range_partial(vma, vma->vm_start, addr, pgoff, vma->vm_end - vma->vm_start); } EXPORT_SYMBOL(remap_vmalloc_range); void free_vm_area(struct vm_struct *area) { struct vm_struct *ret; ret = remove_vm_area(area->addr); BUG_ON(ret != area); kfree(area); } EXPORT_SYMBOL_GPL(free_vm_area); #ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { return rb_entry_safe(n, struct vmap_area, rb_node); } /** * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to * @addr: target address * * Returns: vmap_area if it is found. If there is no such area * the first highest(reverse order) vmap_area is returned * i.e. va->va_start < addr && va->va_end < addr or NULL * if there are no any areas before @addr. */ static struct vmap_area * pvm_find_va_enclose_addr(unsigned long addr) { struct vmap_area *va, *tmp; struct rb_node *n; n = free_vmap_area_root.rb_node; va = NULL; while (n) { tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_start <= addr) { va = tmp; if (tmp->va_end >= addr) break; n = n->rb_right; } else { n = n->rb_left; } } return va; } /** * pvm_determine_end_from_reverse - find the highest aligned address * of free block below VMALLOC_END * @va: * in - the VA we start the search(reverse order); * out - the VA with the highest aligned end address. * @align: alignment for required highest address * * Returns: determined end address within vmap_area */ static unsigned long pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) { unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); unsigned long addr; if (likely(*va)) { list_for_each_entry_from_reverse((*va), &free_vmap_area_list, list) { addr = min((*va)->va_end & ~(align - 1), vmalloc_end); if ((*va)->va_start < addr) return addr; } } return 0; } /** * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator * @offsets: array containing offset of each area * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to * be scattered pretty far, distance between two areas easily going up * to gigabytes. To avoid interacting with regular vmallocs, these * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans free blocks from the end looking * for matching base. While scanning, if any of the areas do not fit the * base address is pulled down to fit the area. Scanning is repeated till * all the areas fit and then all necessary data structures are inserted * and the result is returned. */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; unsigned long base, start, size, end, last_end, orig_start, orig_end; bool purged = false; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { start = offsets[area]; end = start + sizes[area]; /* is everything aligned properly? */ BUG_ON(!IS_ALIGNED(offsets[area], align)); BUG_ON(!IS_ALIGNED(sizes[area], align)); /* detect the area with the highest address */ if (start > offsets[last_area]) last_area = area; for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; if (vmalloc_end - vmalloc_start < last_end) { WARN_ON(true); return NULL; } vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); if (!vas || !vms) goto err_free2; for (area = 0; area < nr_vms; area++) { vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } retry: spin_lock(&free_vmap_area_lock); /* start scanning - we scan from the top, begin with the last area */ area = term_area = last_area; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(vmalloc_end); base = pvm_determine_end_from_reverse(&va, align) - end; while (true) { /* * base might have underflowed, add last_end before * comparing. */ if (base + last_end < vmalloc_start + last_end) goto overflow; /* * Fitting base has not been found. */ if (va == NULL) goto overflow; /* * If required width exceeds current VA block, move * base downwards and then recheck. */ if (base + end > va->va_end) { base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * If this VA does not fit, move base downwards and recheck. */ if (base + start < va->va_start) { va = node_to_va(rb_prev(&va->rb_node)); base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * This area fits, move on to the previous one. If * the previous one is the terminal one, we're done. */ area = (area + nr_vms - 1) % nr_vms; if (area == term_area) break; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(base + end); } /* we've found a fitting base, insert all va's */ for (area = 0; area < nr_vms; area++) { int ret; start = base + offsets[area]; size = sizes[area]; va = pvm_find_va_enclose_addr(start); if (WARN_ON_ONCE(va == NULL)) /* It is a BUG(), but trigger recovery instead. */ goto recovery; ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list, va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; /* Allocated area. */ va = vas[area]; va->va_start = start; va->va_end = start + size; } spin_unlock(&free_vmap_area_lock); /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; } /* insert all vm's */ spin_lock(&vmap_area_lock); for (area = 0; area < nr_vms; area++) { insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); } spin_unlock(&vmap_area_lock); /* * Mark allocated areas as accessible. Do it now as a best-effort * approach, as they can be mapped outside of vmalloc code. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; recovery: /* * Remove previously allocated areas. There is no * need in removing these areas from the busy tree, * because they are inserted only on the final step * and when pcpu_get_vm_areas() is success. */ while (area--) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); vas[area] = NULL; } overflow: spin_unlock(&free_vmap_area_lock); if (!purged) { reclaim_and_purge_vmap_areas(); purged = true; /* Before "retry", check if we recover. */ for (area = 0; area < nr_vms; area++) { if (vas[area]) continue; vas[area] = kmem_cache_zalloc( vmap_area_cachep, GFP_KERNEL); if (!vas[area]) goto err_free; } goto retry; } err_free: for (area = 0; area < nr_vms; area++) { if (vas[area]) kmem_cache_free(vmap_area_cachep, vas[area]); kfree(vms[area]); } err_free2: kfree(vas); kfree(vms); return NULL; err_free_shadow: spin_lock(&free_vmap_area_lock); /* * We release all the vmalloc shadows, even the ones for regions that * hadn't been successfully added. This relies on kasan_release_vmalloc * being able to tolerate this case. */ for (area = 0; area < nr_vms; area++) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); vas[area] = NULL; kfree(vms[area]); } spin_unlock(&free_vmap_area_lock); kfree(vas); kfree(vms); return NULL; } /** * pcpu_free_vm_areas - free vmalloc areas for percpu allocator * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() * @nr_vms: the number of allocated areas * * Free vm_structs and the array allocated by pcpu_get_vm_areas(). */ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) { int i; for (i = 0; i < nr_vms; i++) free_vm_area(vms[i]); kfree(vms); } #endif /* CONFIG_SMP */ #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { void *objp = (void *)PAGE_ALIGN((unsigned long)object); const void *caller; struct vm_struct *vm; struct vmap_area *va; unsigned long addr; unsigned int nr_pages; if (!spin_trylock(&vmap_area_lock)) return false; va = __find_vmap_area((unsigned long)objp, &vmap_area_root); if (!va) { spin_unlock(&vmap_area_lock); return false; } vm = va->vm; if (!vm) { spin_unlock(&vmap_area_lock); return false; } addr = (unsigned long)vm->addr; caller = vm->caller; nr_pages = vm->nr_pages; spin_unlock(&vmap_area_lock); pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", nr_pages, addr, caller); return true; } #endif #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_purge_lock) __acquires(&vmap_area_lock) { mutex_lock(&vmap_purge_lock); spin_lock(&vmap_area_lock); return seq_list_start(&vmap_area_list, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &vmap_area_list, pos); } static void s_stop(struct seq_file *m, void *p) __releases(&vmap_area_lock) __releases(&vmap_purge_lock) { spin_unlock(&vmap_area_lock); mutex_unlock(&vmap_purge_lock); } static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; unsigned int step = 1U << vm_area_page_order(v); if (!counters) return; if (v->flags & VM_UNINITIALIZED) return; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr += step) counters[page_to_nid(v->pages[nr])] += step; for_each_node_state(nr, N_HIGH_MEMORY) if (counters[nr]) seq_printf(m, " N%u=%u", nr, counters[nr]); } } static void show_purge_info(struct seq_file *m) { struct vmap_area *va; spin_lock(&purge_vmap_area_lock); list_for_each_entry(va, &purge_vmap_area_list, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); } spin_unlock(&purge_vmap_area_lock); } static int s_show(struct seq_file *m, void *p) { struct vmap_area *va; struct vm_struct *v; va = list_entry(p, struct vmap_area, list); if (!va->vm) { if (va->flags & VMAP_RAM) seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); goto final; } v = va->vm; seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) seq_printf(m, " phys=%pa", &v->phys_addr); if (v->flags & VM_IOREMAP) seq_puts(m, " ioremap"); if (v->flags & VM_ALLOC) seq_puts(m, " vmalloc"); if (v->flags & VM_MAP) seq_puts(m, " vmap"); if (v->flags & VM_USERMAP) seq_puts(m, " user"); if (v->flags & VM_DMA_COHERENT) seq_puts(m, " dma-coherent"); if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); show_numa_info(m, v); seq_putc(m, '\n'); /* * As a final step, dump "unpurged" areas. */ final: if (list_is_last(&va->list, &vmap_area_list)) show_purge_info(m); return 0; } static const struct seq_operations vmalloc_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; static int __init proc_vmalloc_init(void) { if (IS_ENABLED(CONFIG_NUMA)) proc_create_seq_private("vmallocinfo", 0400, NULL, &vmalloc_op, nr_node_ids * sizeof(unsigned int), NULL); else proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); return 0; } module_init(proc_vmalloc_init); #endif void __init vmalloc_init(void) { struct vmap_area *va; struct vm_struct *tmp; int i; /* * Create the cache for vmap_area objects. */ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); for_each_possible_cpu(i) { struct vmap_block_queue *vbq; struct vfree_deferred *p; vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); p = &per_cpu(vfree_deferred, i); init_llist_head(&p->list); INIT_WORK(&p->wq, delayed_vfree_work); xa_init(&vbq->vmap_blocks); } /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (WARN_ON_ONCE(!va)) continue; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } /* * Now we can initialize a free vmap space. */ vmap_init_free_space(); vmap_initialized = true; }
52 101 101 101 101 49 49 49 49 49 49 49 49 49 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 // SPDX-License-Identifier: MIT /* * Copyright (C) 2012-2014 Canonical Ltd (Maarten Lankhorst) * * Based on bo.c which bears the following copyright notice, * but is dual licensed: * * Copyright (c) 2006-2009 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. * **************************************************************************/ /* * Authors: Thomas Hellstrom <thellstrom-at-vmware-dot-com> */ #include <linux/dma-resv.h> #include <linux/dma-fence-array.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> #include <linux/seq_file.h> /** * DOC: Reservation Object Overview * * The reservation object provides a mechanism to manage a container of * dma_fence object associated with a resource. A reservation object * can have any number of fences attaches to it. Each fence carries an usage * parameter determining how the operation represented by the fence is using the * resource. The RCU mechanism is used to protect read access to fences from * locked write-side updates. * * See struct dma_resv for more details. */ DEFINE_WD_CLASS(reservation_ww_class); EXPORT_SYMBOL(reservation_ww_class); /* Mask for the lower fence pointer bits */ #define DMA_RESV_LIST_MASK 0x3 struct dma_resv_list { struct rcu_head rcu; u32 num_fences, max_fences; struct dma_fence __rcu *table[]; }; /* Extract the fence and usage flags from an RCU protected entry in the list. */ static void dma_resv_list_entry(struct dma_resv_list *list, unsigned int index, struct dma_resv *resv, struct dma_fence **fence, enum dma_resv_usage *usage) { long tmp; tmp = (long)rcu_dereference_check(list->table[index], resv ? dma_resv_held(resv) : true); *fence = (struct dma_fence *)(tmp & ~DMA_RESV_LIST_MASK); if (usage) *usage = tmp & DMA_RESV_LIST_MASK; } /* Set the fence and usage flags at the specific index in the list. */ static void dma_resv_list_set(struct dma_resv_list *list, unsigned int index, struct dma_fence *fence, enum dma_resv_usage usage) { long tmp = ((long)fence) | usage; RCU_INIT_POINTER(list->table[index], (struct dma_fence *)tmp); } /* * Allocate a new dma_resv_list and make sure to correctly initialize * max_fences. */ static struct dma_resv_list *dma_resv_list_alloc(unsigned int max_fences) { struct dma_resv_list *list; size_t size; /* Round up to the next kmalloc bucket size. */ size = kmalloc_size_roundup(struct_size(list, table, max_fences)); list = kmalloc(size, GFP_KERNEL); if (!list) return NULL; /* Given the resulting bucket size, recalculated max_fences. */ list->max_fences = (size - offsetof(typeof(*list), table)) / sizeof(*list->table); return list; } /* Free a dma_resv_list and make sure to drop all references. */ static void dma_resv_list_free(struct dma_resv_list *list) { unsigned int i; if (!list) return; for (i = 0; i < list->num_fences; ++i) { struct dma_fence *fence; dma_resv_list_entry(list, i, NULL, &fence, NULL); dma_fence_put(fence); } kfree_rcu(list, rcu); } /** * dma_resv_init - initialize a reservation object * @obj: the reservation object */ void dma_resv_init(struct dma_resv *obj) { ww_mutex_init(&obj->lock, &reservation_ww_class); RCU_INIT_POINTER(obj->fences, NULL); } EXPORT_SYMBOL(dma_resv_init); /** * dma_resv_fini - destroys a reservation object * @obj: the reservation object */ void dma_resv_fini(struct dma_resv *obj) { /* * This object should be dead and all references must have * been released to it, so no need to be protected with rcu. */ dma_resv_list_free(rcu_dereference_protected(obj->fences, true)); ww_mutex_destroy(&obj->lock); } EXPORT_SYMBOL(dma_resv_fini); /* Dereference the fences while ensuring RCU rules */ static inline struct dma_resv_list *dma_resv_fences_list(struct dma_resv *obj) { return rcu_dereference_check(obj->fences, dma_resv_held(obj)); } /** * dma_resv_reserve_fences - Reserve space to add fences to a dma_resv object. * @obj: reservation object * @num_fences: number of fences we want to add * * Should be called before dma_resv_add_fence(). Must be called with @obj * locked through dma_resv_lock(). * * Note that the preallocated slots need to be re-reserved if @obj is unlocked * at any time before calling dma_resv_add_fence(). This is validated when * CONFIG_DEBUG_MUTEXES is enabled. * * RETURNS * Zero for success, or -errno */ int dma_resv_reserve_fences(struct dma_resv *obj, unsigned int num_fences) { struct dma_resv_list *old, *new; unsigned int i, j, k, max; dma_resv_assert_held(obj); old = dma_resv_fences_list(obj); if (old && old->max_fences) { if ((old->num_fences + num_fences) <= old->max_fences) return 0; max = max(old->num_fences + num_fences, old->max_fences * 2); } else { max = max(4ul, roundup_pow_of_two(num_fences)); } new = dma_resv_list_alloc(max); if (!new) return -ENOMEM; /* * no need to bump fence refcounts, rcu_read access * requires the use of kref_get_unless_zero, and the * references from the old struct are carried over to * the new. */ for (i = 0, j = 0, k = max; i < (old ? old->num_fences : 0); ++i) { enum dma_resv_usage usage; struct dma_fence *fence; dma_resv_list_entry(old, i, obj, &fence, &usage); if (dma_fence_is_signaled(fence)) RCU_INIT_POINTER(new->table[--k], fence); else dma_resv_list_set(new, j++, fence, usage); } new->num_fences = j; /* * We are not changing the effective set of fences here so can * merely update the pointer to the new array; both existing * readers and new readers will see exactly the same set of * active (unsignaled) fences. Individual fences and the * old array are protected by RCU and so will not vanish under * the gaze of the rcu_read_lock() readers. */ rcu_assign_pointer(obj->fences, new); if (!old) return 0; /* Drop the references to the signaled fences */ for (i = k; i < max; ++i) { struct dma_fence *fence; fence = rcu_dereference_protected(new->table[i], dma_resv_held(obj)); dma_fence_put(fence); } kfree_rcu(old, rcu); return 0; } EXPORT_SYMBOL(dma_resv_reserve_fences); #ifdef CONFIG_DEBUG_MUTEXES /** * dma_resv_reset_max_fences - reset fences for debugging * @obj: the dma_resv object to reset * * Reset the number of pre-reserved fence slots to test that drivers do * correct slot allocation using dma_resv_reserve_fences(). See also * &dma_resv_list.max_fences. */ void dma_resv_reset_max_fences(struct dma_resv *obj) { struct dma_resv_list *fences = dma_resv_fences_list(obj); dma_resv_assert_held(obj); /* Test fence slot reservation */ if (fences) fences->max_fences = fences->num_fences; } EXPORT_SYMBOL(dma_resv_reset_max_fences); #endif /** * dma_resv_add_fence - Add a fence to the dma_resv obj * @obj: the reservation object * @fence: the fence to add * @usage: how the fence is used, see enum dma_resv_usage * * Add a fence to a slot, @obj must be locked with dma_resv_lock(), and * dma_resv_reserve_fences() has been called. * * See also &dma_resv.fence for a discussion of the semantics. */ void dma_resv_add_fence(struct dma_resv *obj, struct dma_fence *fence, enum dma_resv_usage usage) { struct dma_resv_list *fobj; struct dma_fence *old; unsigned int i, count; dma_fence_get(fence); dma_resv_assert_held(obj); /* Drivers should not add containers here, instead add each fence * individually. */ WARN_ON(dma_fence_is_container(fence)); fobj = dma_resv_fences_list(obj); count = fobj->num_fences; for (i = 0; i < count; ++i) { enum dma_resv_usage old_usage; dma_resv_list_entry(fobj, i, obj, &old, &old_usage); if ((old->context == fence->context && old_usage >= usage && dma_fence_is_later(fence, old)) || dma_fence_is_signaled(old)) { dma_resv_list_set(fobj, i, fence, usage); dma_fence_put(old); return; } } BUG_ON(fobj->num_fences >= fobj->max_fences); count++; dma_resv_list_set(fobj, i, fence, usage); /* pointer update must be visible before we extend the num_fences */ smp_store_mb(fobj->num_fences, count); } EXPORT_SYMBOL(dma_resv_add_fence); /** * dma_resv_replace_fences - replace fences in the dma_resv obj * @obj: the reservation object * @context: the context of the fences to replace * @replacement: the new fence to use instead * @usage: how the new fence is used, see enum dma_resv_usage * * Replace fences with a specified context with a new fence. Only valid if the * operation represented by the original fence has no longer access to the * resources represented by the dma_resv object when the new fence completes. * * And example for using this is replacing a preemption fence with a page table * update fence which makes the resource inaccessible. */ void dma_resv_replace_fences(struct dma_resv *obj, uint64_t context, struct dma_fence *replacement, enum dma_resv_usage usage) { struct dma_resv_list *list; unsigned int i; dma_resv_assert_held(obj); list = dma_resv_fences_list(obj); for (i = 0; list && i < list->num_fences; ++i) { struct dma_fence *old; dma_resv_list_entry(list, i, obj, &old, NULL); if (old->context != context) continue; dma_resv_list_set(list, i, dma_fence_get(replacement), usage); dma_fence_put(old); } } EXPORT_SYMBOL(dma_resv_replace_fences); /* Restart the unlocked iteration by initializing the cursor object. */ static void dma_resv_iter_restart_unlocked(struct dma_resv_iter *cursor) { cursor->index = 0; cursor->num_fences = 0; cursor->fences = dma_resv_fences_list(cursor->obj); if (cursor->fences) cursor->num_fences = cursor->fences->num_fences; cursor->is_restarted = true; } /* Walk to the next not signaled fence and grab a reference to it */ static void dma_resv_iter_walk_unlocked(struct dma_resv_iter *cursor) { if (!cursor->fences) return; do { /* Drop the reference from the previous round */ dma_fence_put(cursor->fence); if (cursor->index >= cursor->num_fences) { cursor->fence = NULL; break; } dma_resv_list_entry(cursor->fences, cursor->index++, cursor->obj, &cursor->fence, &cursor->fence_usage); cursor->fence = dma_fence_get_rcu(cursor->fence); if (!cursor->fence) { dma_resv_iter_restart_unlocked(cursor); continue; } if (!dma_fence_is_signaled(cursor->fence) && cursor->usage >= cursor->fence_usage) break; } while (true); } /** * dma_resv_iter_first_unlocked - first fence in an unlocked dma_resv obj. * @cursor: the cursor with the current position * * Subsequent fences are iterated with dma_resv_iter_next_unlocked(). * * Beware that the iterator can be restarted. Code which accumulates statistics * or similar needs to check for this with dma_resv_iter_is_restarted(). For * this reason prefer the locked dma_resv_iter_first() whenver possible. * * Returns the first fence from an unlocked dma_resv obj. */ struct dma_fence *dma_resv_iter_first_unlocked(struct dma_resv_iter *cursor) { rcu_read_lock(); do { dma_resv_iter_restart_unlocked(cursor); dma_resv_iter_walk_unlocked(cursor); } while (dma_resv_fences_list(cursor->obj) != cursor->fences); rcu_read_unlock(); return cursor->fence; } EXPORT_SYMBOL(dma_resv_iter_first_unlocked); /** * dma_resv_iter_next_unlocked - next fence in an unlocked dma_resv obj. * @cursor: the cursor with the current position * * Beware that the iterator can be restarted. Code which accumulates statistics * or similar needs to check for this with dma_resv_iter_is_restarted(). For * this reason prefer the locked dma_resv_iter_next() whenver possible. * * Returns the next fence from an unlocked dma_resv obj. */ struct dma_fence *dma_resv_iter_next_unlocked(struct dma_resv_iter *cursor) { bool restart; rcu_read_lock(); cursor->is_restarted = false; restart = dma_resv_fences_list(cursor->obj) != cursor->fences; do { if (restart) dma_resv_iter_restart_unlocked(cursor); dma_resv_iter_walk_unlocked(cursor); restart = true; } while (dma_resv_fences_list(cursor->obj) != cursor->fences); rcu_read_unlock(); return cursor->fence; } EXPORT_SYMBOL(dma_resv_iter_next_unlocked); /** * dma_resv_iter_first - first fence from a locked dma_resv object * @cursor: cursor to record the current position * * Subsequent fences are iterated with dma_resv_iter_next_unlocked(). * * Return the first fence in the dma_resv object while holding the * &dma_resv.lock. */ struct dma_fence *dma_resv_iter_first(struct dma_resv_iter *cursor) { struct dma_fence *fence; dma_resv_assert_held(cursor->obj); cursor->index = 0; cursor->fences = dma_resv_fences_list(cursor->obj); fence = dma_resv_iter_next(cursor); cursor->is_restarted = true; return fence; } EXPORT_SYMBOL_GPL(dma_resv_iter_first); /** * dma_resv_iter_next - next fence from a locked dma_resv object * @cursor: cursor to record the current position * * Return the next fences from the dma_resv object while holding the * &dma_resv.lock. */ struct dma_fence *dma_resv_iter_next(struct dma_resv_iter *cursor) { struct dma_fence *fence; dma_resv_assert_held(cursor->obj); cursor->is_restarted = false; do { if (!cursor->fences || cursor->index >= cursor->fences->num_fences) return NULL; dma_resv_list_entry(cursor->fences, cursor->index++, cursor->obj, &fence, &cursor->fence_usage); } while (cursor->fence_usage > cursor->usage); return fence; } EXPORT_SYMBOL_GPL(dma_resv_iter_next); /** * dma_resv_copy_fences - Copy all fences from src to dst. * @dst: the destination reservation object * @src: the source reservation object * * Copy all fences from src to dst. dst-lock must be held. */ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src) { struct dma_resv_iter cursor; struct dma_resv_list *list; struct dma_fence *f; dma_resv_assert_held(dst); list = NULL; dma_resv_iter_begin(&cursor, src, DMA_RESV_USAGE_BOOKKEEP); dma_resv_for_each_fence_unlocked(&cursor, f) { if (dma_resv_iter_is_restarted(&cursor)) { dma_resv_list_free(list); list = dma_resv_list_alloc(cursor.num_fences); if (!list) { dma_resv_iter_end(&cursor); return -ENOMEM; } list->num_fences = 0; } dma_fence_get(f); dma_resv_list_set(list, list->num_fences++, f, dma_resv_iter_usage(&cursor)); } dma_resv_iter_end(&cursor); list = rcu_replace_pointer(dst->fences, list, dma_resv_held(dst)); dma_resv_list_free(list); return 0; } EXPORT_SYMBOL(dma_resv_copy_fences); /** * dma_resv_get_fences - Get an object's fences * fences without update side lock held * @obj: the reservation object * @usage: controls which fences to include, see enum dma_resv_usage. * @num_fences: the number of fences returned * @fences: the array of fence ptrs returned (array is krealloc'd to the * required size, and must be freed by caller) * * Retrieve all fences from the reservation object. * Returns either zero or -ENOMEM. */ int dma_resv_get_fences(struct dma_resv *obj, enum dma_resv_usage usage, unsigned int *num_fences, struct dma_fence ***fences) { struct dma_resv_iter cursor; struct dma_fence *fence; *num_fences = 0; *fences = NULL; dma_resv_iter_begin(&cursor, obj, usage); dma_resv_for_each_fence_unlocked(&cursor, fence) { if (dma_resv_iter_is_restarted(&cursor)) { struct dma_fence **new_fences; unsigned int count; while (*num_fences) dma_fence_put((*fences)[--(*num_fences)]); count = cursor.num_fences + 1; /* Eventually re-allocate the array */ new_fences = krealloc_array(*fences, count, sizeof(void *), GFP_KERNEL); if (count && !new_fences) { kfree(*fences); *fences = NULL; *num_fences = 0; dma_resv_iter_end(&cursor); return -ENOMEM; } *fences = new_fences; } (*fences)[(*num_fences)++] = dma_fence_get(fence); } dma_resv_iter_end(&cursor); return 0; } EXPORT_SYMBOL_GPL(dma_resv_get_fences); /** * dma_resv_get_singleton - Get a single fence for all the fences * @obj: the reservation object * @usage: controls which fences to include, see enum dma_resv_usage. * @fence: the resulting fence * * Get a single fence representing all the fences inside the resv object. * Returns either 0 for success or -ENOMEM. * * Warning: This can't be used like this when adding the fence back to the resv * object since that can lead to stack corruption when finalizing the * dma_fence_array. * * Returns 0 on success and negative error values on failure. */ int dma_resv_get_singleton(struct dma_resv *obj, enum dma_resv_usage usage, struct dma_fence **fence) { struct dma_fence_array *array; struct dma_fence **fences; unsigned count; int r; r = dma_resv_get_fences(obj, usage, &count, &fences); if (r) return r; if (count == 0) { *fence = NULL; return 0; } if (count == 1) { *fence = fences[0]; kfree(fences); return 0; } array = dma_fence_array_create(count, fences, dma_fence_context_alloc(1), 1, false); if (!array) { while (count--) dma_fence_put(fences[count]); kfree(fences); return -ENOMEM; } *fence = &array->base; return 0; } EXPORT_SYMBOL_GPL(dma_resv_get_singleton); /** * dma_resv_wait_timeout - Wait on reservation's objects fences * @obj: the reservation object * @usage: controls which fences to include, see enum dma_resv_usage. * @intr: if true, do interruptible wait * @timeout: timeout value in jiffies or zero to return immediately * * Callers are not required to hold specific locks, but maybe hold * dma_resv_lock() already * RETURNS * Returns -ERESTARTSYS if interrupted, 0 if the wait timed out, or * greater than zero on success. */ long dma_resv_wait_timeout(struct dma_resv *obj, enum dma_resv_usage usage, bool intr, unsigned long timeout) { long ret = timeout ? timeout : 1; struct dma_resv_iter cursor; struct dma_fence *fence; dma_resv_iter_begin(&cursor, obj, usage); dma_resv_for_each_fence_unlocked(&cursor, fence) { ret = dma_fence_wait_timeout(fence, intr, ret); if (ret <= 0) { dma_resv_iter_end(&cursor); return ret; } } dma_resv_iter_end(&cursor); return ret; } EXPORT_SYMBOL_GPL(dma_resv_wait_timeout); /** * dma_resv_set_deadline - Set a deadline on reservation's objects fences * @obj: the reservation object * @usage: controls which fences to include, see enum dma_resv_usage. * @deadline: the requested deadline (MONOTONIC) * * May be called without holding the dma_resv lock. Sets @deadline on * all fences filtered by @usage. */ void dma_resv_set_deadline(struct dma_resv *obj, enum dma_resv_usage usage, ktime_t deadline) { struct dma_resv_iter cursor; struct dma_fence *fence; dma_resv_iter_begin(&cursor, obj, usage); dma_resv_for_each_fence_unlocked(&cursor, fence) { dma_fence_set_deadline(fence, deadline); } dma_resv_iter_end(&cursor); } EXPORT_SYMBOL_GPL(dma_resv_set_deadline); /** * dma_resv_test_signaled - Test if a reservation object's fences have been * signaled. * @obj: the reservation object * @usage: controls which fences to include, see enum dma_resv_usage. * * Callers are not required to hold specific locks, but maybe hold * dma_resv_lock() already. * * RETURNS * * True if all fences signaled, else false. */ bool dma_resv_test_signaled(struct dma_resv *obj, enum dma_resv_usage usage) { struct dma_resv_iter cursor; struct dma_fence *fence; dma_resv_iter_begin(&cursor, obj, usage); dma_resv_for_each_fence_unlocked(&cursor, fence) { dma_resv_iter_end(&cursor); return false; } dma_resv_iter_end(&cursor); return true; } EXPORT_SYMBOL_GPL(dma_resv_test_signaled); /** * dma_resv_describe - Dump description of the resv object into seq_file * @obj: the reservation object * @seq: the seq_file to dump the description into * * Dump a textual description of the fences inside an dma_resv object into the * seq_file. */ void dma_resv_describe(struct dma_resv *obj, struct seq_file *seq) { static const char *usage[] = { "kernel", "write", "read", "bookkeep" }; struct dma_resv_iter cursor; struct dma_fence *fence; dma_resv_for_each_fence(&cursor, obj, DMA_RESV_USAGE_READ, fence) { seq_printf(seq, "\t%s fence:", usage[dma_resv_iter_usage(&cursor)]); dma_fence_describe(fence, seq); } } EXPORT_SYMBOL_GPL(dma_resv_describe); #if IS_ENABLED(CONFIG_LOCKDEP) static int __init dma_resv_lockdep(void) { struct mm_struct *mm = mm_alloc(); struct ww_acquire_ctx ctx; struct dma_resv obj; struct address_space mapping; int ret; if (!mm) return -ENOMEM; dma_resv_init(&obj); address_space_init_once(&mapping); mmap_read_lock(mm); ww_acquire_init(&ctx, &reservation_ww_class); ret = dma_resv_lock(&obj, &ctx); if (ret == -EDEADLK) dma_resv_lock_slow(&obj, &ctx); fs_reclaim_acquire(GFP_KERNEL); /* for unmap_mapping_range on trylocked buffer objects in shrinkers */ i_mmap_lock_write(&mapping); i_mmap_unlock_write(&mapping); #ifdef CONFIG_MMU_NOTIFIER lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); __dma_fence_might_wait(); lock_map_release(&__mmu_notifier_invalidate_range_start_map); #else __dma_fence_might_wait(); #endif fs_reclaim_release(GFP_KERNEL); ww_mutex_unlock(&obj.lock); ww_acquire_fini(&ctx); mmap_read_unlock(mm); mmput(mm); return 0; } subsys_initcall(dma_resv_lockdep); #endif
1 1 1 1 1 1 1 4 3 1 1 1 1 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux driver for M2Tech hiFace compatible devices * * Copyright 2012-2013 (C) M2TECH S.r.l and Amarula Solutions B.V. * * Authors: Michael Trimarchi <michael@amarulasolutions.com> * Antonio Ospite <ao2@amarulasolutions.com> * * The driver is based on the work done in TerraTec DMX 6Fire USB */ #include <linux/module.h> #include <linux/slab.h> #include <sound/initval.h> #include "chip.h" #include "pcm.h" MODULE_AUTHOR("Michael Trimarchi <michael@amarulasolutions.com>"); MODULE_AUTHOR("Antonio Ospite <ao2@amarulasolutions.com>"); MODULE_DESCRIPTION("M2Tech hiFace USB-SPDIF audio driver"); MODULE_LICENSE("GPL v2"); static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-max */ static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* Id for card */ static bool enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP; /* Enable this card */ #define DRIVER_NAME "snd-usb-hiface" #define CARD_NAME "hiFace" module_param_array(index, int, NULL, 0444); MODULE_PARM_DESC(index, "Index value for " CARD_NAME " soundcard."); module_param_array(id, charp, NULL, 0444); MODULE_PARM_DESC(id, "ID string for " CARD_NAME " soundcard."); module_param_array(enable, bool, NULL, 0444); MODULE_PARM_DESC(enable, "Enable " CARD_NAME " soundcard."); static DEFINE_MUTEX(register_mutex); struct hiface_vendor_quirk { const char *device_name; u8 extra_freq; }; static int hiface_chip_create(struct usb_interface *intf, struct usb_device *device, int idx, const struct hiface_vendor_quirk *quirk, struct hiface_chip **rchip) { struct snd_card *card = NULL; struct hiface_chip *chip; int ret; int len; *rchip = NULL; /* if we are here, card can be registered in alsa. */ ret = snd_card_new(&intf->dev, index[idx], id[idx], THIS_MODULE, sizeof(*chip), &card); if (ret < 0) { dev_err(&device->dev, "cannot create alsa card.\n"); return ret; } strscpy(card->driver, DRIVER_NAME, sizeof(card->driver)); if (quirk && quirk->device_name) strscpy(card->shortname, quirk->device_name, sizeof(card->shortname)); else strscpy(card->shortname, "M2Tech generic audio", sizeof(card->shortname)); strlcat(card->longname, card->shortname, sizeof(card->longname)); len = strlcat(card->longname, " at ", sizeof(card->longname)); if (len < sizeof(card->longname)) usb_make_path(device, card->longname + len, sizeof(card->longname) - len); chip = card->private_data; chip->dev = device; chip->card = card; *rchip = chip; return 0; } static int hiface_chip_probe(struct usb_interface *intf, const struct usb_device_id *usb_id) { const struct hiface_vendor_quirk *quirk = (struct hiface_vendor_quirk *)usb_id->driver_info; int ret; int i; struct hiface_chip *chip; struct usb_device *device = interface_to_usbdev(intf); ret = usb_set_interface(device, 0, 0); if (ret != 0) { dev_err(&device->dev, "can't set first interface for " CARD_NAME " device.\n"); return -EIO; } /* check whether the card is already registered */ chip = NULL; mutex_lock(&register_mutex); for (i = 0; i < SNDRV_CARDS; i++) if (enable[i]) break; if (i >= SNDRV_CARDS) { dev_err(&device->dev, "no available " CARD_NAME " audio device\n"); ret = -ENODEV; goto err; } ret = hiface_chip_create(intf, device, i, quirk, &chip); if (ret < 0) goto err; ret = hiface_pcm_init(chip, quirk ? quirk->extra_freq : 0); if (ret < 0) goto err_chip_destroy; ret = snd_card_register(chip->card); if (ret < 0) { dev_err(&device->dev, "cannot register " CARD_NAME " card\n"); goto err_chip_destroy; } mutex_unlock(&register_mutex); usb_set_intfdata(intf, chip); return 0; err_chip_destroy: snd_card_free(chip->card); err: mutex_unlock(&register_mutex); return ret; } static void hiface_chip_disconnect(struct usb_interface *intf) { struct hiface_chip *chip; struct snd_card *card; chip = usb_get_intfdata(intf); if (!chip) return; card = chip->card; /* Make sure that the userspace cannot create new request */ snd_card_disconnect(card); hiface_pcm_abort(chip); snd_card_free_when_closed(card); } static const struct usb_device_id device_table[] = { { USB_DEVICE(0x04b4, 0x0384), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "Young", .extra_freq = 1, } }, { USB_DEVICE(0x04b4, 0x930b), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "hiFace", } }, { USB_DEVICE(0x04b4, 0x931b), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "North Star", } }, { USB_DEVICE(0x04b4, 0x931c), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "W4S Young", } }, { USB_DEVICE(0x04b4, 0x931d), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "Corrson", } }, { USB_DEVICE(0x04b4, 0x931e), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "AUDIA", } }, { USB_DEVICE(0x04b4, 0x931f), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "SL Audio", } }, { USB_DEVICE(0x04b4, 0x9320), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "Empirical", } }, { USB_DEVICE(0x04b4, 0x9321), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "Rockna", } }, { USB_DEVICE(0x249c, 0x9001), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "Pathos", } }, { USB_DEVICE(0x249c, 0x9002), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "Metronome", } }, { USB_DEVICE(0x249c, 0x9006), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "CAD", } }, { USB_DEVICE(0x249c, 0x9008), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "Audio Esclusive", } }, { USB_DEVICE(0x249c, 0x931c), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "Rotel", } }, { USB_DEVICE(0x249c, 0x932c), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "Eeaudio", } }, { USB_DEVICE(0x245f, 0x931c), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "CHORD", } }, { USB_DEVICE(0x25c6, 0x9002), .driver_info = (unsigned long)&(const struct hiface_vendor_quirk) { .device_name = "Vitus", } }, {} }; MODULE_DEVICE_TABLE(usb, device_table); static struct usb_driver hiface_usb_driver = { .name = DRIVER_NAME, .probe = hiface_chip_probe, .disconnect = hiface_chip_disconnect, .id_table = device_table, }; module_usb_driver(hiface_usb_driver);
11 11 11 11 11 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 /* * linux/drivers/video/fbmem.c * * Copyright (C) 1994 Martin Schaller * * 2001 - Documented with DocBook * - Brad Douglas <brad@neruo.com> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. */ #include <linux/console.h> #include <linux/export.h> #include <linux/fb.h> #include <linux/fbcon.h> #include <video/nomodeset.h> #include "fb_internal.h" /* * Frame buffer device initialization and setup routines */ #define FBPIXMAPSIZE (1024 * 8) struct class *fb_class; DEFINE_MUTEX(registration_lock); struct fb_info *registered_fb[FB_MAX] __read_mostly; int num_registered_fb __read_mostly; #define for_each_registered_fb(i) \ for (i = 0; i < FB_MAX; i++) \ if (!registered_fb[i]) {} else struct fb_info *get_fb_info(unsigned int idx) { struct fb_info *fb_info; if (idx >= FB_MAX) return ERR_PTR(-ENODEV); mutex_lock(&registration_lock); fb_info = registered_fb[idx]; if (fb_info) refcount_inc(&fb_info->count); mutex_unlock(&registration_lock); return fb_info; } void put_fb_info(struct fb_info *fb_info) { if (!refcount_dec_and_test(&fb_info->count)) return; if (fb_info->fbops->fb_destroy) fb_info->fbops->fb_destroy(fb_info); } /* * Helpers */ int fb_get_color_depth(struct fb_var_screeninfo *var, struct fb_fix_screeninfo *fix) { int depth = 0; if (fix->visual == FB_VISUAL_MONO01 || fix->visual == FB_VISUAL_MONO10) depth = 1; else { if (var->green.length == var->blue.length && var->green.length == var->red.length && var->green.offset == var->blue.offset && var->green.offset == var->red.offset) depth = var->green.length; else depth = var->green.length + var->red.length + var->blue.length; } return depth; } EXPORT_SYMBOL(fb_get_color_depth); /* * Data padding functions. */ void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height) { __fb_pad_aligned_buffer(dst, d_pitch, src, s_pitch, height); } EXPORT_SYMBOL(fb_pad_aligned_buffer); void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx, u32 height, u32 shift_high, u32 shift_low, u32 mod) { u8 mask = (u8) (0xfff << shift_high), tmp; int i, j; for (i = height; i--; ) { for (j = 0; j < idx; j++) { tmp = dst[j]; tmp &= mask; tmp |= *src >> shift_low; dst[j] = tmp; tmp = *src << shift_high; dst[j+1] = tmp; src++; } tmp = dst[idx]; tmp &= mask; tmp |= *src >> shift_low; dst[idx] = tmp; if (shift_high < mod) { tmp = *src << shift_high; dst[idx+1] = tmp; } src++; dst += d_pitch; } } EXPORT_SYMBOL(fb_pad_unaligned_buffer); /* * we need to lock this section since fb_cursor * may use fb_imageblit() */ char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size) { u32 align = buf->buf_align - 1, offset; char *addr = buf->addr; /* If IO mapped, we need to sync before access, no sharing of * the pixmap is done */ if (buf->flags & FB_PIXMAP_IO) { if (info->fbops->fb_sync && (buf->flags & FB_PIXMAP_SYNC)) info->fbops->fb_sync(info); return addr; } /* See if we fit in the remaining pixmap space */ offset = buf->offset + align; offset &= ~align; if (offset + size > buf->size) { /* We do not fit. In order to be able to re-use the buffer, * we must ensure no asynchronous DMA'ing or whatever operation * is in progress, we sync for that. */ if (info->fbops->fb_sync && (buf->flags & FB_PIXMAP_SYNC)) info->fbops->fb_sync(info); offset = 0; } buf->offset = offset + size; addr += offset; return addr; } EXPORT_SYMBOL(fb_get_buffer_offset); int fb_pan_display(struct fb_info *info, struct fb_var_screeninfo *var) { struct fb_fix_screeninfo *fix = &info->fix; unsigned int yres = info->var.yres; int err = 0; if (var->yoffset > 0) { if (var->vmode & FB_VMODE_YWRAP) { if (!fix->ywrapstep || (var->yoffset % fix->ywrapstep)) err = -EINVAL; else yres = 0; } else if (!fix->ypanstep || (var->yoffset % fix->ypanstep)) err = -EINVAL; } if (var->xoffset > 0 && (!fix->xpanstep || (var->xoffset % fix->xpanstep))) err = -EINVAL; if (err || !info->fbops->fb_pan_display || var->yoffset > info->var.yres_virtual - yres || var->xoffset > info->var.xres_virtual - info->var.xres) return -EINVAL; if ((err = info->fbops->fb_pan_display(var, info))) return err; info->var.xoffset = var->xoffset; info->var.yoffset = var->yoffset; if (var->vmode & FB_VMODE_YWRAP) info->var.vmode |= FB_VMODE_YWRAP; else info->var.vmode &= ~FB_VMODE_YWRAP; return 0; } EXPORT_SYMBOL(fb_pan_display); static int fb_check_caps(struct fb_info *info, struct fb_var_screeninfo *var, u32 activate) { struct fb_blit_caps caps, fbcaps; int err = 0; memset(&caps, 0, sizeof(caps)); memset(&fbcaps, 0, sizeof(fbcaps)); caps.flags = (activate & FB_ACTIVATE_ALL) ? 1 : 0; fbcon_get_requirement(info, &caps); info->fbops->fb_get_caps(info, &fbcaps, var); if (((fbcaps.x ^ caps.x) & caps.x) || ((fbcaps.y ^ caps.y) & caps.y) || (fbcaps.len < caps.len)) err = -EINVAL; return err; } int fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) { int ret = 0; u32 activate; struct fb_var_screeninfo old_var; struct fb_videomode mode; struct fb_event event; u32 unused; if (var->activate & FB_ACTIVATE_INV_MODE) { struct fb_videomode mode1, mode2; fb_var_to_videomode(&mode1, var); fb_var_to_videomode(&mode2, &info->var); /* make sure we don't delete the videomode of current var */ ret = fb_mode_is_equal(&mode1, &mode2); if (!ret) { ret = fbcon_mode_deleted(info, &mode1); if (!ret) fb_delete_videomode(&mode1, &info->modelist); } return ret ? -EINVAL : 0; } if (!(var->activate & FB_ACTIVATE_FORCE) && !memcmp(&info->var, var, sizeof(struct fb_var_screeninfo))) return 0; activate = var->activate; /* When using FOURCC mode, make sure the red, green, blue and * transp fields are set to 0. */ if ((info->fix.capabilities & FB_CAP_FOURCC) && var->grayscale > 1) { if (var->red.offset || var->green.offset || var->blue.offset || var->transp.offset || var->red.length || var->green.length || var->blue.length || var->transp.length || var->red.msb_right || var->green.msb_right || var->blue.msb_right || var->transp.msb_right) return -EINVAL; } if (!info->fbops->fb_check_var) { *var = info->var; return 0; } /* bitfill_aligned() assumes that it's at least 8x8 */ if (var->xres < 8 || var->yres < 8) return -EINVAL; /* Too huge resolution causes multiplication overflow. */ if (check_mul_overflow(var->xres, var->yres, &unused) || check_mul_overflow(var->xres_virtual, var->yres_virtual, &unused)) return -EINVAL; ret = info->fbops->fb_check_var(var, info); if (ret) return ret; /* verify that virtual resolution >= physical resolution */ if (var->xres_virtual < var->xres || var->yres_virtual < var->yres) { pr_warn("WARNING: fbcon: Driver '%s' missed to adjust virtual screen size (%ux%u vs. %ux%u)\n", info->fix.id, var->xres_virtual, var->yres_virtual, var->xres, var->yres); return -EINVAL; } if ((var->activate & FB_ACTIVATE_MASK) != FB_ACTIVATE_NOW) return 0; if (info->fbops->fb_get_caps) { ret = fb_check_caps(info, var, activate); if (ret) return ret; } old_var = info->var; info->var = *var; if (info->fbops->fb_set_par) { ret = info->fbops->fb_set_par(info); if (ret) { info->var = old_var; printk(KERN_WARNING "detected " "fb_set_par error, " "error code: %d\n", ret); return ret; } } fb_pan_display(info, &info->var); fb_set_cmap(&info->cmap, info); fb_var_to_videomode(&mode, &info->var); if (info->modelist.prev && info->modelist.next && !list_empty(&info->modelist)) ret = fb_add_videomode(&mode, &info->modelist); if (ret) return ret; event.info = info; event.data = &mode; fb_notifier_call_chain(FB_EVENT_MODE_CHANGE, &event); return 0; } EXPORT_SYMBOL(fb_set_var); int fb_blank(struct fb_info *info, int blank) { struct fb_event event; int ret = -EINVAL; if (blank > FB_BLANK_POWERDOWN) blank = FB_BLANK_POWERDOWN; event.info = info; event.data = &blank; if (info->fbops->fb_blank) ret = info->fbops->fb_blank(blank, info); if (!ret) fb_notifier_call_chain(FB_EVENT_BLANK, &event); return ret; } EXPORT_SYMBOL(fb_blank); static int fb_check_foreignness(struct fb_info *fi) { const bool foreign_endian = fi->flags & FBINFO_FOREIGN_ENDIAN; fi->flags &= ~FBINFO_FOREIGN_ENDIAN; #ifdef __BIG_ENDIAN fi->flags |= foreign_endian ? 0 : FBINFO_BE_MATH; #else fi->flags |= foreign_endian ? FBINFO_BE_MATH : 0; #endif /* __BIG_ENDIAN */ if (fi->flags & FBINFO_BE_MATH && !fb_be_math(fi)) { pr_err("%s: enable CONFIG_FB_BIG_ENDIAN to " "support this framebuffer\n", fi->fix.id); return -ENOSYS; } else if (!(fi->flags & FBINFO_BE_MATH) && fb_be_math(fi)) { pr_err("%s: enable CONFIG_FB_LITTLE_ENDIAN to " "support this framebuffer\n", fi->fix.id); return -ENOSYS; } return 0; } static int do_register_framebuffer(struct fb_info *fb_info) { int i; struct fb_videomode mode; if (fb_check_foreignness(fb_info)) return -ENOSYS; if (num_registered_fb == FB_MAX) return -ENXIO; num_registered_fb++; for (i = 0 ; i < FB_MAX; i++) if (!registered_fb[i]) break; fb_info->node = i; refcount_set(&fb_info->count, 1); mutex_init(&fb_info->lock); mutex_init(&fb_info->mm_lock); fb_device_create(fb_info); if (fb_info->pixmap.addr == NULL) { fb_info->pixmap.addr = kmalloc(FBPIXMAPSIZE, GFP_KERNEL); if (fb_info->pixmap.addr) { fb_info->pixmap.size = FBPIXMAPSIZE; fb_info->pixmap.buf_align = 1; fb_info->pixmap.scan_align = 1; fb_info->pixmap.access_align = 32; fb_info->pixmap.flags = FB_PIXMAP_DEFAULT; } } fb_info->pixmap.offset = 0; if (!fb_info->pixmap.blit_x) fb_info->pixmap.blit_x = ~(u32)0; if (!fb_info->pixmap.blit_y) fb_info->pixmap.blit_y = ~(u32)0; if (!fb_info->modelist.prev || !fb_info->modelist.next) INIT_LIST_HEAD(&fb_info->modelist); if (fb_info->skip_vt_switch) pm_vt_switch_required(fb_info->device, false); else pm_vt_switch_required(fb_info->device, true); fb_var_to_videomode(&mode, &fb_info->var); fb_add_videomode(&mode, &fb_info->modelist); registered_fb[i] = fb_info; #ifdef CONFIG_GUMSTIX_AM200EPD { struct fb_event event; event.info = fb_info; fb_notifier_call_chain(FB_EVENT_FB_REGISTERED, &event); } #endif return fbcon_fb_registered(fb_info); } static void unbind_console(struct fb_info *fb_info) { int i = fb_info->node; if (WARN_ON(i < 0 || i >= FB_MAX || registered_fb[i] != fb_info)) return; fbcon_fb_unbind(fb_info); } static void unlink_framebuffer(struct fb_info *fb_info) { int i; i = fb_info->node; if (WARN_ON(i < 0 || i >= FB_MAX || registered_fb[i] != fb_info)) return; fb_device_destroy(fb_info); pm_vt_switch_unregister(fb_info->device); unbind_console(fb_info); } static void do_unregister_framebuffer(struct fb_info *fb_info) { unlink_framebuffer(fb_info); if (fb_info->pixmap.addr && (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT)) { kfree(fb_info->pixmap.addr); fb_info->pixmap.addr = NULL; } fb_destroy_modelist(&fb_info->modelist); registered_fb[fb_info->node] = NULL; num_registered_fb--; #ifdef CONFIG_GUMSTIX_AM200EPD { struct fb_event event; event.info = fb_info; fb_notifier_call_chain(FB_EVENT_FB_UNREGISTERED, &event); } #endif fbcon_fb_unregistered(fb_info); /* this may free fb info */ put_fb_info(fb_info); } /** * register_framebuffer - registers a frame buffer device * @fb_info: frame buffer info structure * * Registers a frame buffer device @fb_info. * * Returns negative errno on error, or zero for success. * */ int register_framebuffer(struct fb_info *fb_info) { int ret; mutex_lock(&registration_lock); ret = do_register_framebuffer(fb_info); mutex_unlock(&registration_lock); return ret; } EXPORT_SYMBOL(register_framebuffer); /** * unregister_framebuffer - releases a frame buffer device * @fb_info: frame buffer info structure * * Unregisters a frame buffer device @fb_info. * * Returns negative errno on error, or zero for success. * * This function will also notify the framebuffer console * to release the driver. * * This is meant to be called within a driver's module_exit() * function. If this is called outside module_exit(), ensure * that the driver implements fb_open() and fb_release() to * check that no processes are using the device. */ void unregister_framebuffer(struct fb_info *fb_info) { mutex_lock(&registration_lock); do_unregister_framebuffer(fb_info); mutex_unlock(&registration_lock); } EXPORT_SYMBOL(unregister_framebuffer); /** * fb_set_suspend - low level driver signals suspend * @info: framebuffer affected * @state: 0 = resuming, !=0 = suspending * * This is meant to be used by low level drivers to * signal suspend/resume to the core & clients. * It must be called with the console semaphore held */ void fb_set_suspend(struct fb_info *info, int state) { WARN_CONSOLE_UNLOCKED(); if (state) { fbcon_suspended(info); info->state = FBINFO_STATE_SUSPENDED; } else { info->state = FBINFO_STATE_RUNNING; fbcon_resumed(info); } } EXPORT_SYMBOL(fb_set_suspend); static int __init fbmem_init(void) { int ret; fb_class = class_create("graphics"); if (IS_ERR(fb_class)) { ret = PTR_ERR(fb_class); pr_err("Unable to create fb class; errno = %d\n", ret); goto err_fb_class; } ret = fb_init_procfs(); if (ret) goto err_class_destroy; ret = fb_register_chrdev(); if (ret) goto err_fb_cleanup_procfs; fb_console_init(); return 0; err_fb_cleanup_procfs: fb_cleanup_procfs(); err_class_destroy: class_destroy(fb_class); err_fb_class: fb_class = NULL; return ret; } #ifdef MODULE static void __exit fbmem_exit(void) { fb_console_exit(); fb_unregister_chrdev(); fb_cleanup_procfs(); class_destroy(fb_class); } module_init(fbmem_init); module_exit(fbmem_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Framebuffer base"); #else subsys_initcall(fbmem_init); #endif int fb_new_modelist(struct fb_info *info) { struct fb_var_screeninfo var = info->var; struct list_head *pos, *n; struct fb_modelist *modelist; struct fb_videomode *m, mode; int err; list_for_each_safe(pos, n, &info->modelist) { modelist = list_entry(pos, struct fb_modelist, list); m = &modelist->mode; fb_videomode_to_var(&var, m); var.activate = FB_ACTIVATE_TEST; err = fb_set_var(info, &var); fb_var_to_videomode(&mode, &var); if (err || !fb_mode_is_equal(m, &mode)) { list_del(pos); kfree(pos); } } if (list_empty(&info->modelist)) return 1; fbcon_new_modelist(info); return 0; } #if defined(CONFIG_VIDEO_NOMODESET) bool fb_modesetting_disabled(const char *drvname) { bool fwonly = video_firmware_drivers_only(); if (fwonly) pr_warn("Driver %s not loading because of nomodeset parameter\n", drvname); return fwonly; } EXPORT_SYMBOL(fb_modesetting_disabled); #endif MODULE_LICENSE("GPL");
24 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/fib_notifier.h> static unsigned int fib_notifier_net_id; struct fib_notifier_net { struct list_head fib_notifier_ops; struct atomic_notifier_head fib_chain; }; int call_fib_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { int err; err = nb->notifier_call(nb, event_type, info); return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); int err; err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info); return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifiers); static unsigned int fib_seq_sum(struct net *net) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; unsigned int fib_seq = 0; rtnl_lock(); rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) continue; fib_seq += ops->fib_seq_read(net); module_put(ops->owner); } rcu_read_unlock(); rtnl_unlock(); return fib_seq; } static int fib_net_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) continue; err = ops->fib_dump(net, nb, extack); module_put(ops->owner); if (err) goto unlock; } unlock: rcu_read_unlock(); return err; } static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb), unsigned int fib_seq) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); atomic_notifier_chain_register(&fn_net->fib_chain, nb); if (fib_seq == fib_seq_sum(net)) return true; atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); if (cb) cb(nb); return false; } #define FIB_DUMP_MAX_RETRIES 5 int register_fib_notifier(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb), struct netlink_ext_ack *extack) { int retries = 0; int err; do { unsigned int fib_seq = fib_seq_sum(net); err = fib_net_dump(net, nb, extack); if (err) return err; if (fib_dump_is_consistent(net, nb, cb, fib_seq)) return 0; } while (++retries < FIB_DUMP_MAX_RETRIES); return -EBUSY; } EXPORT_SYMBOL(register_fib_notifier); int unregister_fib_notifier(struct net *net, struct notifier_block *nb) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); } EXPORT_SYMBOL(unregister_fib_notifier); static int __fib_notifier_ops_register(struct fib_notifier_ops *ops, struct net *net) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *o; list_for_each_entry(o, &fn_net->fib_notifier_ops, list) if (ops->family == o->family) return -EEXIST; list_add_tail_rcu(&ops->list, &fn_net->fib_notifier_ops); return 0; } struct fib_notifier_ops * fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net) { struct fib_notifier_ops *ops; int err; ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (!ops) return ERR_PTR(-ENOMEM); err = __fib_notifier_ops_register(ops, net); if (err) goto err_register; return ops; err_register: kfree(ops); return ERR_PTR(err); } EXPORT_SYMBOL(fib_notifier_ops_register); void fib_notifier_ops_unregister(struct fib_notifier_ops *ops) { list_del_rcu(&ops->list); kfree_rcu(ops, rcu); } EXPORT_SYMBOL(fib_notifier_ops_unregister); static int __net_init fib_notifier_net_init(struct net *net) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); INIT_LIST_HEAD(&fn_net->fib_notifier_ops); ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain); return 0; } static void __net_exit fib_notifier_net_exit(struct net *net) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); WARN_ON_ONCE(!list_empty(&fn_net->fib_notifier_ops)); } static struct pernet_operations fib_notifier_net_ops = { .init = fib_notifier_net_init, .exit = fib_notifier_net_exit, .id = &fib_notifier_net_id, .size = sizeof(struct fib_notifier_net), }; static int __init fib_notifier_init(void) { return register_pernet_subsys(&fib_notifier_net_ops); } subsys_initcall(fib_notifier_init);
231 230 230 230 230 230 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" #define DEVLINK_PORT_FN_CAPS_VALID_MASK \ (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, DEVLINK_PORT_FN_STATE_ACTIVE), [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), }; #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ WARN_ON_ONCE(!(devlink_port)->registered) #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ WARN_ON_ONCE((devlink_port)->registered) struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, unsigned int port_index) { return xa_load(&devlink->ports, port_index); } struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { if (attrs[DEVLINK_ATTR_PORT_INDEX]) { u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return ERR_PTR(-ENODEV); return devlink_port; } return ERR_PTR(-EINVAL); } struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_port_get_from_attrs(devlink, info->attrs); } static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, u32 cap, bool is_enable) { caps->selector |= cap; if (is_enable) caps->value |= cap; } static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_roce_get) return 0; err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); return 0; } static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_migratable_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_migratable_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); return 0; } static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_crypto_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable); return 0; } static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_packet_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable); return 0; } static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { struct nla_bitfield32 caps = {}; int err; err = devlink_port_fn_roce_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack); if (err) return err; if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, caps.selector); if (err) return err; *msg_updated = true; return 0; } int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) { if (devlink_nl_put_handle(msg, devlink_port->devlink)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) return -EMSGSIZE; return 0; } size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ } static int devlink_nl_port_attrs_put(struct sk_buff *msg, struct devlink_port *devlink_port) { struct devlink_port_attrs *attrs = &devlink_port->attrs; if (!devlink_port->attrs_set) return 0; if (attrs->lanes) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) return -EMSGSIZE; } if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_pf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_vf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_sf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_sf.pf) || nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, attrs->pci_sf.sf)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->phys.port_number)) return -EMSGSIZE; if (!attrs->split) return 0; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->phys.port_number)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, attrs->phys.split_subport_number)) return -EMSGSIZE; break; default: break; } return 0; } static int devlink_port_fn_hw_addr_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { u8 hw_addr[MAX_ADDR_LEN]; int hw_addr_len; int err; if (!port->ops->port_fn_hw_addr_get) return 0; err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); if (err) return err; *msg_updated = true; return 0; } static bool devlink_port_fn_state_valid(enum devlink_port_fn_state state) { return state == DEVLINK_PORT_FN_STATE_INACTIVE || state == DEVLINK_PORT_FN_STATE_ACTIVE; } static bool devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) { return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; } static int devlink_port_fn_state_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { enum devlink_port_fn_opstate opstate; enum devlink_port_fn_state state; int err; if (!port->ops->port_fn_state_get) return 0; err = port->ops->port_fn_state_get(port, &state, &opstate, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (!devlink_port_fn_state_valid(state)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid state read from driver"); return -EINVAL; } if (!devlink_port_fn_opstate_valid(opstate)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid operational state read from driver"); return -EINVAL; } if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) return -EMSGSIZE; *msg_updated = true; return 0; } static int devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_migratable_set(devlink_port, enable, extack); } static int devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_roce_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack); } static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nla_bitfield32 caps; u32 caps_value; int err; caps = nla_get_bitfield32(attr); caps_value = caps.value & caps.selector; if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { err = devlink_port_fn_roce_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_ROCE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { err = devlink_port_fn_mig_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_MIGRATABLE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_PACKET, extack); if (err) return err; } return 0; } static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { struct nlattr *function_attr; bool msg_updated = false; int err; function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); if (!function_attr) return -EMSGSIZE; err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_rel_devlink_handle_put(msg, port->devlink, port->rel_index, DEVLINK_PORT_FN_ATTR_DEVLINK, &msg_updated); out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); else nla_nest_end(msg, function_attr); return err; } static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_port->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { if (devlink_port->type_eth.netdev && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, devlink_port->type_eth.ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, devlink_port->type_eth.ifname))) goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { struct ib_device *ibdev = devlink_port->type_ib.ibdev; if (ibdev && nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, ibdev->name)) goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) goto nla_put_failure; if (devlink_port->linecard && nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, devlink_linecard_index(devlink_port->linecard))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_port_notify(struct devlink_port *devlink_port, enum devlink_command cmd) { struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } static void devlink_ports_notify(struct devlink *devlink, enum devlink_command cmd) { struct devlink_port *devlink_port; unsigned long port_index; xa_for_each(&devlink->ports, port_index, devlink_port) devlink_port_notify(devlink_port, cmd); } void devlink_ports_notify_register(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW); } void devlink_ports_notify_unregister(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL); } int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; unsigned long port_index; int err = 0; xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) { err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); if (err) { state->idx = port_index; break; } } return err; } int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one); } static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) { int err; if (!devlink_port->ops->port_type_set) return -EOPNOTSUPP; if (port_type == devlink_port->type) return 0; err = devlink_port->ops->port_type_set(devlink_port, port_type); if (err) return err; devlink_port->desired_type = port_type; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } static int devlink_port_function_hw_addr_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { const u8 *hw_addr; int hw_addr_len; hw_addr = nla_data(attr); hw_addr_len = nla_len(attr); if (hw_addr_len > MAX_ADDR_LEN) { NL_SET_ERR_MSG(extack, "Port function hardware address too long"); return -EINVAL; } if (port->type == DEVLINK_PORT_TYPE_ETH) { if (hw_addr_len != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device"); return -EINVAL; } if (!is_unicast_ether_addr(hw_addr)) { NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported"); return -EINVAL; } } return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len, extack); } static int devlink_port_fn_state_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { enum devlink_port_fn_state state; state = nla_get_u8(attr); return port->ops->port_fn_state_set(port, state, extack); } static int devlink_port_function_validate(struct devlink_port *devlink_port, struct nlattr **tb, struct netlink_ext_ack *extack) { const struct devlink_port_ops *ops = devlink_port->ops; struct nlattr *attr; if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && !ops->port_fn_hw_addr_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], "Port doesn't support function attributes"); return -EOPNOTSUPP; } if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], "Function does not support state setting"); return -EOPNOTSUPP; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { struct nla_bitfield32 caps; caps = nla_get_bitfield32(attr); if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && !ops->port_fn_roce_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support RoCE function attribute"); return -EOPNOTSUPP; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { if (!ops->port_fn_migratable_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support migratable function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "migratable function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { if (!ops->port_fn_ipsec_crypto_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_crypto function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_crypto function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { if (!ops->port_fn_ipsec_packet_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_packet function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_packet function attribute supported for VFs only"); return -EOPNOTSUPP; } } } return 0; } static int devlink_port_function_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; int err; err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, devlink_function_nl_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Fail to parse port function attributes"); return err; } err = devlink_port_function_validate(port, tb, extack); if (err) return err; attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; if (attr) { err = devlink_port_function_hw_addr_set(port, attr, extack); if (err) return err; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { err = devlink_port_fn_caps_set(port, attr, extack); if (err) return err; } /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * Those can be applied first before activating the state. */ attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; if (attr) err = devlink_port_fn_state_set(port, attr, extack); if (!err) devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); return err; } int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; int err; if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { enum devlink_port_type port_type; port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); err = devlink_port_type_set(devlink_port, port_type); if (err) return err; } if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; struct netlink_ext_ack *extack = info->extack; err = devlink_port_function_set(devlink_port, attr, extack); if (err) return err; } return 0; } int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; u32 count; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) return -EINVAL; if (!devlink_port->ops->port_split) return -EOPNOTSUPP; count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); if (!devlink_port->attrs.splittable) { /* Split ports cannot be split. */ if (devlink_port->attrs.split) NL_SET_ERR_MSG(info->extack, "Port cannot be split further"); else NL_SET_ERR_MSG(info->extack, "Port cannot be split"); return -EINVAL; } if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { NL_SET_ERR_MSG(info->extack, "Invalid split count"); return -EINVAL; } return devlink_port->ops->port_split(devlink, devlink_port, count, info->extack); } int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_unsplit) return -EOPNOTSUPP; return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack); } int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink_port_new_attrs new_attrs = {}; struct devlink *devlink = info->user_ptr[0]; struct devlink_port *devlink_port; struct sk_buff *msg; int err; if (!devlink->ops->port_new) return -EOPNOTSUPP; if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified"); return -EINVAL; } new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); new_attrs.pfnum = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { /* Port index of the new port being created by driver. */ new_attrs.port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); new_attrs.port_index_valid = true; } if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { new_attrs.controller = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); new_attrs.controller_valid = true; } if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); new_attrs.sfnum_valid = true; } err = devlink->ops->port_new(devlink, &new_attrs, extack, &devlink_port); if (err) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err_out_port_del; } err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, info->snd_portid, info->snd_seq, 0, NULL); if (WARN_ON_ONCE(err)) goto err_out_msg_free; err = genlmsg_reply(msg, info); if (err) goto err_out_port_del; return 0; err_out_msg_free: nlmsg_free(msg); err_out_port_del: devlink_port->ops->port_del(devlink, devlink_port, NULL); return err; } int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_del) return -EOPNOTSUPP; return devlink_port->ops->port_del(devlink, devlink_port, extack); } static void devlink_port_type_warn(struct work_struct *work) { struct devlink_port *port = container_of(to_delayed_work(work), struct devlink_port, type_warn_dw); dev_warn(port->devlink->dev, "Type was not set for devlink port."); } static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) { /* Ignore CPU and DSA flavours. */ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; } #define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; /* Schedule a work to WARN in case driver does not set port * type within timeout. */ schedule_delayed_work(&devlink_port->type_warn_dw, DEVLINK_PORT_TYPE_WARN_TIMEOUT); } static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; cancel_delayed_work_sync(&devlink_port->type_warn_dw); } /** * devlink_port_init() - Init devlink port * * @devlink: devlink * @devlink_port: devlink port * * Initialize essential stuff that is needed for functions * that may be called before devlink port registration. * Call to this function is optional and not needed * in case the driver does not use such functions. */ void devlink_port_init(struct devlink *devlink, struct devlink_port *devlink_port) { if (devlink_port->initialized) return; devlink_port->devlink = devlink; INIT_LIST_HEAD(&devlink_port->region_list); devlink_port->initialized = true; } EXPORT_SYMBOL_GPL(devlink_port_init); /** * devlink_port_fini() - Deinitialize devlink port * * @devlink_port: devlink port * * Deinitialize essential stuff that is in use for functions * that may be called after devlink port unregistration. * Call to this function is optional and not needed * in case the driver does not use such functions. */ void devlink_port_fini(struct devlink_port *devlink_port) { WARN_ON(!list_empty(&devlink_port->region_list)); } EXPORT_SYMBOL_GPL(devlink_port_fini); static const struct devlink_port_ops devlink_port_dummy_ops = {}; /** * devl_port_register_with_ops() - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. */ int devl_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_assert_locked(devlink); ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port_init(devlink, devlink_port); devlink_port->registered = true; devlink_port->index = port_index; devlink_port->ops = ops ? ops : &devlink_port_dummy_ops; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); if (err) { devlink_port->registered = false; return err; } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_port_register_with_ops); /** * devlink_port_register_with_ops - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. * * Context: Takes and release devlink->lock <mutex>. */ int devlink_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_lock(devlink); err = devl_port_register_with_ops(devlink, devlink_port, port_index, ops); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_port_register_with_ops); /** * devl_port_unregister() - Unregister devlink port * * @devlink_port: devlink port */ void devl_port_unregister(struct devlink_port *devlink_port) { lockdep_assert_held(&devlink_port->devlink->lock); WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); xa_erase(&devlink_port->devlink->ports, devlink_port->index); WARN_ON(!list_empty(&devlink_port->reporter_list)); devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); /** * devlink_port_unregister - Unregister devlink port * * @devlink_port: devlink port * * Context: Takes and release devlink->lock <mutex>. */ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; devl_lock(devlink); devl_port_unregister(devlink_port); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_port_unregister); static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, struct net_device *netdev) { const struct net_device_ops *ops = netdev->netdev_ops; /* If driver registers devlink port, it should set devlink port * attributes accordingly so the compat functions are called * and the original ops are not used. */ if (ops->ndo_get_phys_port_name) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_phys_port_name * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ char name[IFNAMSIZ]; int err; err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name)); WARN_ON(err != -EOPNOTSUPP); } if (ops->ndo_get_port_parent_id) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_port_parent_id * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ struct netdev_phys_item_id ppid; int err; err = ops->ndo_get_port_parent_id(netdev, &ppid); WARN_ON(err != -EOPNOTSUPP); } } static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { struct net_device *netdev = type_dev; ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (type == DEVLINK_PORT_TYPE_NOTSET) { devlink_port_type_warn_schedule(devlink_port); } else { devlink_port_type_warn_cancel(devlink_port); if (type == DEVLINK_PORT_TYPE_ETH && netdev) devlink_port_type_netdev_checks(devlink_port, netdev); } spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; switch (type) { case DEVLINK_PORT_TYPE_ETH: devlink_port->type_eth.netdev = netdev; if (netdev) { ASSERT_RTNL(); devlink_port->type_eth.ifindex = netdev->ifindex; BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != sizeof(netdev->name)); strcpy(devlink_port->type_eth.ifname, netdev->name); } break; case DEVLINK_PORT_TYPE_IB: devlink_port->type_ib.ibdev = type_dev; break; default: break; } spin_unlock_bh(&devlink_port->type_lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } /** * devlink_port_type_eth_set - Set port type to Ethernet * * @devlink_port: devlink port * * If driver is calling this, most likely it is doing something wrong. */ void devlink_port_type_eth_set(struct devlink_port *devlink_port) { dev_warn(devlink_port->devlink->dev, "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); /** * devlink_port_type_ib_set - Set port type to InfiniBand * * @devlink_port: devlink port * @ibdev: related IB device */ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev) { __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); } EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); /** * devlink_port_type_clear - Clear port type * * @devlink_port: devlink port * * If driver is calling this for clearing Ethernet type, most likely * it is doing something wrong. */ void devlink_port_type_clear(struct devlink_port *devlink_port) { if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); int devlink_port_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct devlink_port *devlink_port = netdev->devlink_port; struct devlink *devlink; if (!devlink_port) return NOTIFY_OK; devlink = devlink_port->devlink; switch (event) { case NETDEV_POST_INIT: /* Set the type but not netdev pointer. It is going to be set * later on by NETDEV_REGISTER event. Happens once during * netdevice register */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); break; case NETDEV_REGISTER: case NETDEV_CHANGENAME: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this * namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, netdev); break; case NETDEV_UNREGISTER: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Clear netdev pointer, but not the type. This event happens * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, NULL); break; case NETDEV_PRE_UNINIT: /* Clear the type and the netdev pointer. Happens one during * netdevice unregister. */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); break; } return NOTIFY_OK; } static int __devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { devlink_port->switch_port = true; if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN)) attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; } else { devlink_port->switch_port = false; } return 0; } /** * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs) { int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); /** * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); /** * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @vf: associated VF of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; attrs->pci_vf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); /** * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @sf: associated SF of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); if (ret) return; attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index, u32 rel_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (devlink_port && devlink_port->rel_index == rel_index) devlink_port->rel_index = 0; } /** * devl_port_fn_devlink_set - Attach peer devlink * instance to port function. * @devlink_port: devlink port * @fn_devlink: devlink instance to attach */ int devl_port_fn_devlink_set(struct devlink_port *devlink_port, struct devlink *fn_devlink) { ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF || devlink_port->attrs.pci_sf.external)) return -EINVAL; return devlink_rel_nested_in_add(&devlink_port->rel_index, devlink_port->devlink->index, devlink_port->index, devlink_port_rel_notify_cb, devlink_port_rel_cleanup_cb, fn_devlink); } EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set); /** * devlink_port_linecard_set - Link port with a linecard * * @devlink_port: devlink port * @linecard: devlink linecard */ void devlink_port_linecard_set(struct devlink_port *devlink_port, struct devlink_linecard *linecard) { ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->linecard = linecard; } EXPORT_SYMBOL_GPL(devlink_port_linecard_set); static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int n = 0; if (!devlink_port->attrs_set) return -EOPNOTSUPP; switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: if (devlink_port->linecard) n = snprintf(name, len, "l%u", devlink_linecard_index(devlink_port->linecard)); if (n < len) n += snprintf(name + n, len - n, "p%u", attrs->phys.port_number); if (n < len && attrs->split) n += snprintf(name + n, len - n, "s%u", attrs->phys.split_subport_number); break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: case DEVLINK_PORT_FLAVOUR_UNUSED: /* As CPU and DSA ports do not have a netdevice associated * case should not ever happen. */ WARN_ON(1); return -EINVAL; case DEVLINK_PORT_FLAVOUR_PCI_PF: if (attrs->pci_pf.external) { n = snprintf(name, len, "c%u", attrs->pci_pf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (attrs->pci_vf.external) { n = snprintf(name, len, "c%u", attrs->pci_vf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%uvf%u", attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (attrs->pci_sf.external) { n = snprintf(name, len, "c%u", attrs->pci_sf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; case DEVLINK_PORT_FLAVOUR_VIRTUAL: return -EOPNOTSUPP; } if (n >= len) return -EINVAL; return 0; } int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len) { struct devlink_port *devlink_port; /* RTNL mutex is held here which ensures that devlink_port * instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. */ ASSERT_RTNL(); devlink_port = dev->devlink_port; if (!devlink_port) return -EOPNOTSUPP; return __devlink_port_phys_port_name_get(devlink_port, name, len); } int devlink_compat_switch_id_get(struct net_device *dev, struct netdev_phys_item_id *ppid) { struct devlink_port *devlink_port; /* Caller must hold RTNL mutex or reference to dev, which ensures that * devlink_port instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. */ devlink_port = dev->devlink_port; if (!devlink_port || !devlink_port->switch_port) return -EOPNOTSUPP; memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid)); return 0; }
4111 2897 3602 3603 3603 6578 5569 6566 1199 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 /* SPDX-License-Identifier: GPL-2.0 */ /* * Variant of atomic_t specialized for reference counts. * * The interface matches the atomic_t interface (to aid in porting) but only * provides the few functions one should use for reference counting. * * Saturation semantics * ==================== * * refcount_t differs from atomic_t in that the counter saturates at * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the * counter and causing 'spurious' use-after-free issues. In order to avoid the * cost associated with introducing cmpxchg() loops into all of the saturating * operations, we temporarily allow the counter to take on an unchecked value * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow * or overflow has occurred. Although this is racy when multiple threads * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly * equidistant from 0 and INT_MAX we minimise the scope for error: * * INT_MAX REFCOUNT_SATURATED UINT_MAX * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) * +--------------------------------+----------------+----------------+ * <---------- bad value! ----------> * * (in a signed view of the world, the "bad value" range corresponds to * a negative counter value). * * As an example, consider a refcount_inc() operation that causes the counter * to overflow: * * int old = atomic_fetch_add_relaxed(r); * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) * if (old < 0) * atomic_set(r, REFCOUNT_SATURATED); * * If another thread also performs a refcount_inc() operation between the two * atomic operations, then the count will continue to edge closer to 0. If it * reaches a value of 1 before /any/ of the threads reset it to the saturated * value, then a concurrent refcount_dec_and_test() may erroneously free the * underlying object. * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK). * With the current PID limit, if no batched refcounting operations are used and * the attacker can't repeatedly trigger kernel oopses in the middle of refcount * operations, this makes it impossible for a saturated refcount to leave the * saturation range, even if it is possible for multiple uses of the same * refcount to nest in the context of a single task: * * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT = * 0x40000000 / 0x400000 = 0x100 = 256 * * If hundreds of references are added/removed with a single refcounting * operation, it may potentially be possible to leave the saturation range; but * given the precise timing details involved with the round-robin scheduling of * each thread manipulating the refcount and the need to hit the race multiple * times in succession, there doesn't appear to be a practical avenue of attack * even if using refcount_add() operations with larger increments. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object we're increasing the * reference count on will provide the ordering. For locked data structures, * its the lock acquire, for RCU/lockless data structures its the dependent * load. * * Do note that inc_not_zero() provides a control dependency which will order * future stores against the inc, this ensures we'll never modify the object * if we did not in fact acquire a reference. * * The decrements will provide release order, such that all the prior loads and * stores will be issued before, it also provides a control dependency, which * will order us against the subsequent free(). * * The control dependency is against the load of the cmpxchg (ll/sc) that * succeeded. This means the stores aren't fully ordered, but this is fine * because the 1->0 transition indicates no concurrency. * * Note that the allocator is responsible for ordering things between free() * and alloc(). * * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * */ #ifndef _LINUX_REFCOUNT_H #define _LINUX_REFCOUNT_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/limits.h> #include <linux/spinlock_types.h> struct mutex; /** * typedef refcount_t - variant of atomic_t specialized for reference counts * @refs: atomic_t counter field * * The counter saturates at REFCOUNT_SATURATED and will not move once * there. This avoids wrapping the counter and causing 'spurious' * use-after-free bugs. */ typedef struct refcount_struct { atomic_t refs; } refcount_t; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } #define REFCOUNT_MAX INT_MAX #define REFCOUNT_SATURATED (INT_MIN / 2) enum refcount_saturation_type { REFCOUNT_ADD_NOT_ZERO_OVF, REFCOUNT_ADD_OVF, REFCOUNT_ADD_UAF, REFCOUNT_SUB_UAF, REFCOUNT_DEC_LEAK, }; void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); /** * refcount_set - set a refcount's value * @r: the refcount * @n: value to which the refcount will be set */ static inline void refcount_set(refcount_t *r, int n) { atomic_set(&r->refs, n); } /** * refcount_read - get a refcount's value * @r: the refcount * * Return: the refcount's value */ static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); } static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); do { if (!old) break; } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); if (oldp) *oldp = old; if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. * * Return: false if the passed refcount is 0, true otherwise */ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) { return __refcount_add_not_zero(i, r, NULL); } static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); if (oldp) *oldp = old; if (unlikely(!old)) refcount_warn_saturate(r, REFCOUNT_ADD_UAF); else if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } /** * refcount_add - add a value to a refcount * @i: the value to add to the refcount * @r: the refcount * * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. */ static inline void refcount_add(int i, refcount_t *r) { __refcount_add(i, r, NULL); } static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) { return __refcount_add_not_zero(1, r, oldp); } /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment * * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED * and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Return: true if the increment was successful, false otherwise */ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { return __refcount_inc_not_zero(r, NULL); } static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); } /** * refcount_inc - increment a refcount * @r: the refcount to increment * * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller already has a * reference on the object. * * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. */ static inline void refcount_inc(refcount_t *r) { __refcount_inc(r, NULL); } static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(i, &r->refs); if (oldp) *oldp = old; if (old == i) { smp_acquire__after_ctrl_dep(); return true; } if (unlikely(old < 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; } /** * refcount_sub_and_test - subtract from a refcount and test if it is 0 * @i: amount to subtract from the refcount * @r: the refcount * * Similar to atomic_dec_and_test(), but it will WARN, return false and * ultimately leak on underflow and will fail to decrement when saturated * at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_dec(), or one of its variants, should instead be used to * decrement a reference count. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { return __refcount_sub_and_test(i, r, NULL); } static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) { return __refcount_sub_and_test(1, r, oldp); } /** * refcount_dec_and_test - decrement a refcount and test if it is 0 * @r: the refcount * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_dec_and_test(refcount_t *r) { return __refcount_dec_and_test(r, NULL); } static inline void __refcount_dec(refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(1, &r->refs); if (oldp) *oldp = old; if (unlikely(old <= 1)) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } /** * refcount_dec - decrement a refcount * @r: the refcount * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement * when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before. */ static inline void refcount_dec(refcount_t *r) { __refcount_dec(r, NULL); } extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock); extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock); extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) __cond_acquires(lock); #endif /* _LINUX_REFCOUNT_H */
344 395 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_RCUREF_H #define _LINUX_RCUREF_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #define RCUREF_ONEREF 0x00000000U #define RCUREF_MAXREF 0x7FFFFFFFU #define RCUREF_SATURATED 0xA0000000U #define RCUREF_RELEASED 0xC0000000U #define RCUREF_DEAD 0xE0000000U #define RCUREF_NOREF 0xFFFFFFFFU /** * rcuref_init - Initialize a rcuref reference count with the given reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) { atomic_set(&ref->refcnt, cnt - 1); } /** * rcuref_read - Read the number of held reference counts of a rcuref * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N) */ static inline unsigned int rcuref_read(rcuref_t *ref) { unsigned int c = atomic_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= RCUREF_RELEASED ? 0 : c + 1; } extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); /** * rcuref_get - Acquire one reference on a rcuref reference count * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See documentation in lib/rcuref.c * * Return: * False if the attempt to acquire a reference failed. This happens * when the last reference has been put already * * True if a reference was successfully acquired */ static inline __must_check bool rcuref_get(rcuref_t *ref) { /* * Unconditionally increase the reference count. The saturation and * dead zones provide enough tolerance for this. */ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt))) return true; /* Handle the cases inside the saturation and dead zones */ return rcuref_get_slowpath(ref); } extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ return rcuref_put_slowpath(ref); } /** * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Can be invoked from contexts, which guarantee that no grace period can * happen which would free the object concurrently if the decrement drops * the last reference and the slowpath races against a concurrent get() and * put() pair. rcu_read_lock()'ed and atomic contexts qualify. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely release the * object which is protected by the reference counter. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref) { return __rcuref_put(ref); } /** * rcuref_put -- Release one reference for a rcuref reference count * @ref: Pointer to the reference count * * Can be invoked from any context. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: * * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ static inline __must_check bool rcuref_put(rcuref_t *ref) { bool released; preempt_disable(); released = __rcuref_put(ref); preempt_enable(); return released; } #endif
21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PATH_H #define _LINUX_PATH_H struct dentry; struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; } __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } static inline void path_put_init(struct path *path) { path_put(path); *path = (struct path) { }; } #endif /* _LINUX_PATH_H */
1 43 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 #undef TRACE_SYSTEM #define TRACE_SYSTEM qdisc #if !defined(_TRACE_QDISC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_QDISC_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> TRACE_EVENT(qdisc_dequeue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, int packets, struct sk_buff *skb), TP_ARGS(qdisc, txq, packets, skb), TP_STRUCT__entry( __field( struct Qdisc *, qdisc ) __field(const struct netdev_queue *, txq ) __field( int, packets ) __field( void *, skbaddr ) __field( int, ifindex ) __field( u32, handle ) __field( u32, parent ) __field( unsigned long, txq_state) ), /* skb==NULL indicate packets dequeued was 0, even when packets==1 */ TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->packets = skb ? packets : 0; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; __entry->txq_state = txq->state; ), TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); TRACE_EVENT(qdisc_enqueue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb), TP_ARGS(qdisc, txq, skb), TP_STRUCT__entry( __field(struct Qdisc *, qdisc) __field(const struct netdev_queue *, txq) __field(void *, skbaddr) __field(int, ifindex) __field(u32, handle) __field(u32, parent) ), TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; ), TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr) ); TRACE_EVENT(qdisc_reset, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q) ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev, qdisc_dev(q)); __assign_str(kind, q->ops->id); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_destroy, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q) ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev, qdisc_dev(q)); __assign_str(kind, q->ops->id); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_create, TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent), TP_ARGS(ops, dev, parent), TP_STRUCT__entry( __string( dev, dev->name ) __string( kind, ops->id ) __field( u32, parent ) ), TP_fast_assign( __assign_str(dev, dev->name); __assign_str(kind, ops->id); __entry->parent = parent; ), TP_printk("dev=%s kind=%s parent=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent)) ); #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 /************************************************************************** * * Copyright 2006-2008 Tungsten Graphics, Inc., Cedar Park, TX. USA. * Copyright 2016 Intel Corporation * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. * * **************************************************************************/ /* * Authors: * Thomas Hellstrom <thomas-at-tungstengraphics-dot-com> */ #ifndef _DRM_MM_H_ #define _DRM_MM_H_ /* * Generic range manager structs */ #include <linux/bug.h> #include <linux/rbtree.h> #include <linux/limits.h> #include <linux/mm_types.h> #include <linux/list.h> #include <linux/spinlock.h> #ifdef CONFIG_DRM_DEBUG_MM #include <linux/stackdepot.h> #endif #include <linux/types.h> #include <drm/drm_print.h> #ifdef CONFIG_DRM_DEBUG_MM #define DRM_MM_BUG_ON(expr) BUG_ON(expr) #else #define DRM_MM_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) #endif /** * enum drm_mm_insert_mode - control search and allocation behaviour * * The &struct drm_mm range manager supports finding a suitable modes using * a number of search trees. These trees are oranised by size, by address and * in most recent eviction order. This allows the user to find either the * smallest hole to reuse, the lowest or highest address to reuse, or simply * reuse the most recent eviction that fits. When allocating the &drm_mm_node * from within the hole, the &drm_mm_insert_mode also dictate whether to * allocate the lowest matching address or the highest. */ enum drm_mm_insert_mode { /** * @DRM_MM_INSERT_BEST: * * Search for the smallest hole (within the search range) that fits * the desired node. * * Allocates the node from the bottom of the found hole. */ DRM_MM_INSERT_BEST = 0, /** * @DRM_MM_INSERT_LOW: * * Search for the lowest hole (address closest to 0, within the search * range) that fits the desired node. * * Allocates the node from the bottom of the found hole. */ DRM_MM_INSERT_LOW, /** * @DRM_MM_INSERT_HIGH: * * Search for the highest hole (address closest to U64_MAX, within the * search range) that fits the desired node. * * Allocates the node from the *top* of the found hole. The specified * alignment for the node is applied to the base of the node * (&drm_mm_node.start). */ DRM_MM_INSERT_HIGH, /** * @DRM_MM_INSERT_EVICT: * * Search for the most recently evicted hole (within the search range) * that fits the desired node. This is appropriate for use immediately * after performing an eviction scan (see drm_mm_scan_init()) and * removing the selected nodes to form a hole. * * Allocates the node from the bottom of the found hole. */ DRM_MM_INSERT_EVICT, /** * @DRM_MM_INSERT_ONCE: * * Only check the first hole for suitablity and report -ENOSPC * immediately otherwise, rather than check every hole until a * suitable one is found. Can only be used in conjunction with another * search method such as DRM_MM_INSERT_HIGH or DRM_MM_INSERT_LOW. */ DRM_MM_INSERT_ONCE = BIT(31), /** * @DRM_MM_INSERT_HIGHEST: * * Only check the highest hole (the hole with the largest address) and * insert the node at the top of the hole or report -ENOSPC if * unsuitable. * * Does not search all holes. */ DRM_MM_INSERT_HIGHEST = DRM_MM_INSERT_HIGH | DRM_MM_INSERT_ONCE, /** * @DRM_MM_INSERT_LOWEST: * * Only check the lowest hole (the hole with the smallest address) and * insert the node at the bottom of the hole or report -ENOSPC if * unsuitable. * * Does not search all holes. */ DRM_MM_INSERT_LOWEST = DRM_MM_INSERT_LOW | DRM_MM_INSERT_ONCE, }; /** * struct drm_mm_node - allocated block in the DRM allocator * * This represents an allocated block in a &drm_mm allocator. Except for * pre-reserved nodes inserted using drm_mm_reserve_node() the structure is * entirely opaque and should only be accessed through the provided funcions. * Since allocation of these nodes is entirely handled by the driver they can be * embedded. */ struct drm_mm_node { /** @color: Opaque driver-private tag. */ unsigned long color; /** @start: Start address of the allocated block. */ u64 start; /** @size: Size of the allocated block. */ u64 size; /* private: */ struct drm_mm *mm; struct list_head node_list; struct list_head hole_stack; struct rb_node rb; struct rb_node rb_hole_size; struct rb_node rb_hole_addr; u64 __subtree_last; u64 hole_size; u64 subtree_max_hole; unsigned long flags; #define DRM_MM_NODE_ALLOCATED_BIT 0 #define DRM_MM_NODE_SCANNED_BIT 1 #ifdef CONFIG_DRM_DEBUG_MM depot_stack_handle_t stack; #endif }; /** * struct drm_mm - DRM allocator * * DRM range allocator with a few special functions and features geared towards * managing GPU memory. Except for the @color_adjust callback the structure is * entirely opaque and should only be accessed through the provided functions * and macros. This structure can be embedded into larger driver structures. */ struct drm_mm { /** * @color_adjust: * * Optional driver callback to further apply restrictions on a hole. The * node argument points at the node containing the hole from which the * block would be allocated (see drm_mm_hole_follows() and friends). The * other arguments are the size of the block to be allocated. The driver * can adjust the start and end as needed to e.g. insert guard pages. */ void (*color_adjust)(const struct drm_mm_node *node, unsigned long color, u64 *start, u64 *end); /* private: */ /* List of all memory nodes that immediately precede a free hole. */ struct list_head hole_stack; /* head_node.node_list is the list of all memory nodes, ordered * according to the (increasing) start address of the memory node. */ struct drm_mm_node head_node; /* Keep an interval_tree for fast lookup of drm_mm_nodes by address. */ struct rb_root_cached interval_tree; struct rb_root_cached holes_size; struct rb_root holes_addr; unsigned long scan_active; }; /** * struct drm_mm_scan - DRM allocator eviction roaster data * * This structure tracks data needed for the eviction roaster set up using * drm_mm_scan_init(), and used with drm_mm_scan_add_block() and * drm_mm_scan_remove_block(). The structure is entirely opaque and should only * be accessed through the provided functions and macros. It is meant to be * allocated temporarily by the driver on the stack. */ struct drm_mm_scan { /* private: */ struct drm_mm *mm; u64 size; u64 alignment; u64 remainder_mask; u64 range_start; u64 range_end; u64 hit_start; u64 hit_end; unsigned long color; enum drm_mm_insert_mode mode; }; /** * drm_mm_node_allocated - checks whether a node is allocated * @node: drm_mm_node to check * * Drivers are required to clear a node prior to using it with the * drm_mm range manager. * * Drivers should use this helper for proper encapsulation of drm_mm * internals. * * Returns: * True if the @node is allocated. */ static inline bool drm_mm_node_allocated(const struct drm_mm_node *node) { return test_bit(DRM_MM_NODE_ALLOCATED_BIT, &node->flags); } /** * drm_mm_initialized - checks whether an allocator is initialized * @mm: drm_mm to check * * Drivers should clear the struct drm_mm prior to initialisation if they * want to use this function. * * Drivers should use this helper for proper encapsulation of drm_mm * internals. * * Returns: * True if the @mm is initialized. */ static inline bool drm_mm_initialized(const struct drm_mm *mm) { return READ_ONCE(mm->hole_stack.next); } /** * drm_mm_hole_follows - checks whether a hole follows this node * @node: drm_mm_node to check * * Holes are embedded into the drm_mm using the tail of a drm_mm_node. * If you wish to know whether a hole follows this particular node, * query this function. See also drm_mm_hole_node_start() and * drm_mm_hole_node_end(). * * Returns: * True if a hole follows the @node. */ static inline bool drm_mm_hole_follows(const struct drm_mm_node *node) { return node->hole_size; } static inline u64 __drm_mm_hole_node_start(const struct drm_mm_node *hole_node) { return hole_node->start + hole_node->size; } /** * drm_mm_hole_node_start - computes the start of the hole following @node * @hole_node: drm_mm_node which implicitly tracks the following hole * * This is useful for driver-specific debug dumpers. Otherwise drivers should * not inspect holes themselves. Drivers must check first whether a hole indeed * follows by looking at drm_mm_hole_follows() * * Returns: * Start of the subsequent hole. */ static inline u64 drm_mm_hole_node_start(const struct drm_mm_node *hole_node) { DRM_MM_BUG_ON(!drm_mm_hole_follows(hole_node)); return __drm_mm_hole_node_start(hole_node); } static inline u64 __drm_mm_hole_node_end(const struct drm_mm_node *hole_node) { return list_next_entry(hole_node, node_list)->start; } /** * drm_mm_hole_node_end - computes the end of the hole following @node * @hole_node: drm_mm_node which implicitly tracks the following hole * * This is useful for driver-specific debug dumpers. Otherwise drivers should * not inspect holes themselves. Drivers must check first whether a hole indeed * follows by looking at drm_mm_hole_follows(). * * Returns: * End of the subsequent hole. */ static inline u64 drm_mm_hole_node_end(const struct drm_mm_node *hole_node) { return __drm_mm_hole_node_end(hole_node); } /** * drm_mm_nodes - list of nodes under the drm_mm range manager * @mm: the struct drm_mm range manager * * As the drm_mm range manager hides its node_list deep with its * structure, extracting it looks painful and repetitive. This is * not expected to be used outside of the drm_mm_for_each_node() * macros and similar internal functions. * * Returns: * The node list, may be empty. */ #define drm_mm_nodes(mm) (&(mm)->head_node.node_list) /** * drm_mm_for_each_node - iterator to walk over all allocated nodes * @entry: &struct drm_mm_node to assign to in each iteration step * @mm: &drm_mm allocator to walk * * This iterator walks over all nodes in the range allocator. It is implemented * with list_for_each(), so not save against removal of elements. */ #define drm_mm_for_each_node(entry, mm) \ list_for_each_entry(entry, drm_mm_nodes(mm), node_list) /** * drm_mm_for_each_node_safe - iterator to walk over all allocated nodes * @entry: &struct drm_mm_node to assign to in each iteration step * @next: &struct drm_mm_node to store the next step * @mm: &drm_mm allocator to walk * * This iterator walks over all nodes in the range allocator. It is implemented * with list_for_each_safe(), so save against removal of elements. */ #define drm_mm_for_each_node_safe(entry, next, mm) \ list_for_each_entry_safe(entry, next, drm_mm_nodes(mm), node_list) /** * drm_mm_for_each_hole - iterator to walk over all holes * @pos: &drm_mm_node used internally to track progress * @mm: &drm_mm allocator to walk * @hole_start: ulong variable to assign the hole start to on each iteration * @hole_end: ulong variable to assign the hole end to on each iteration * * This iterator walks over all holes in the range allocator. It is implemented * with list_for_each(), so not save against removal of elements. @entry is used * internally and will not reflect a real drm_mm_node for the very first hole. * Hence users of this iterator may not access it. * * Implementation Note: * We need to inline list_for_each_entry in order to be able to set hole_start * and hole_end on each iteration while keeping the macro sane. */ #define drm_mm_for_each_hole(pos, mm, hole_start, hole_end) \ for (pos = list_first_entry(&(mm)->hole_stack, \ typeof(*pos), hole_stack); \ &pos->hole_stack != &(mm)->hole_stack ? \ hole_start = drm_mm_hole_node_start(pos), \ hole_end = hole_start + pos->hole_size, \ 1 : 0; \ pos = list_next_entry(pos, hole_stack)) /* * Basic range manager support (drm_mm.c) */ int drm_mm_reserve_node(struct drm_mm *mm, struct drm_mm_node *node); int drm_mm_insert_node_in_range(struct drm_mm *mm, struct drm_mm_node *node, u64 size, u64 alignment, unsigned long color, u64 start, u64 end, enum drm_mm_insert_mode mode); /** * drm_mm_insert_node_generic - search for space and insert @node * @mm: drm_mm to allocate from * @node: preallocate node to insert * @size: size of the allocation * @alignment: alignment of the allocation * @color: opaque tag value to use for this node * @mode: fine-tune the allocation search and placement * * This is a simplified version of drm_mm_insert_node_in_range() with no * range restrictions applied. * * The preallocated node must be cleared to 0. * * Returns: * 0 on success, -ENOSPC if there's no suitable hole. */ static inline int drm_mm_insert_node_generic(struct drm_mm *mm, struct drm_mm_node *node, u64 size, u64 alignment, unsigned long color, enum drm_mm_insert_mode mode) { return drm_mm_insert_node_in_range(mm, node, size, alignment, color, 0, U64_MAX, mode); } /** * drm_mm_insert_node - search for space and insert @node * @mm: drm_mm to allocate from * @node: preallocate node to insert * @size: size of the allocation * * This is a simplified version of drm_mm_insert_node_generic() with @color set * to 0. * * The preallocated node must be cleared to 0. * * Returns: * 0 on success, -ENOSPC if there's no suitable hole. */ static inline int drm_mm_insert_node(struct drm_mm *mm, struct drm_mm_node *node, u64 size) { return drm_mm_insert_node_generic(mm, node, size, 0, 0, 0); } void drm_mm_remove_node(struct drm_mm_node *node); void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new); void drm_mm_init(struct drm_mm *mm, u64 start, u64 size); void drm_mm_takedown(struct drm_mm *mm); /** * drm_mm_clean - checks whether an allocator is clean * @mm: drm_mm allocator to check * * Returns: * True if the allocator is completely free, false if there's still a node * allocated in it. */ static inline bool drm_mm_clean(const struct drm_mm *mm) { return list_empty(drm_mm_nodes(mm)); } struct drm_mm_node * __drm_mm_interval_first(const struct drm_mm *mm, u64 start, u64 last); /** * drm_mm_for_each_node_in_range - iterator to walk over a range of * allocated nodes * @node__: drm_mm_node structure to assign to in each iteration step * @mm__: drm_mm allocator to walk * @start__: starting offset, the first node will overlap this * @end__: ending offset, the last node will start before this (but may overlap) * * This iterator walks over all nodes in the range allocator that lie * between @start and @end. It is implemented similarly to list_for_each(), * but using the internal interval tree to accelerate the search for the * starting node, and so not safe against removal of elements. It assumes * that @end is within (or is the upper limit of) the drm_mm allocator. * If [@start, @end] are beyond the range of the drm_mm, the iterator may walk * over the special _unallocated_ &drm_mm.head_node, and may even continue * indefinitely. */ #define drm_mm_for_each_node_in_range(node__, mm__, start__, end__) \ for (node__ = __drm_mm_interval_first((mm__), (start__), (end__)-1); \ node__->start < (end__); \ node__ = list_next_entry(node__, node_list)) void drm_mm_scan_init_with_range(struct drm_mm_scan *scan, struct drm_mm *mm, u64 size, u64 alignment, unsigned long color, u64 start, u64 end, enum drm_mm_insert_mode mode); /** * drm_mm_scan_init - initialize lru scanning * @scan: scan state * @mm: drm_mm to scan * @size: size of the allocation * @alignment: alignment of the allocation * @color: opaque tag value to use for the allocation * @mode: fine-tune the allocation search and placement * * This is a simplified version of drm_mm_scan_init_with_range() with no range * restrictions applied. * * This simply sets up the scanning routines with the parameters for the desired * hole. * * Warning: * As long as the scan list is non-empty, no other operations than * adding/removing nodes to/from the scan list are allowed. */ static inline void drm_mm_scan_init(struct drm_mm_scan *scan, struct drm_mm *mm, u64 size, u64 alignment, unsigned long color, enum drm_mm_insert_mode mode) { drm_mm_scan_init_with_range(scan, mm, size, alignment, color, 0, U64_MAX, mode); } bool drm_mm_scan_add_block(struct drm_mm_scan *scan, struct drm_mm_node *node); bool drm_mm_scan_remove_block(struct drm_mm_scan *scan, struct drm_mm_node *node); struct drm_mm_node *drm_mm_scan_color_evict(struct drm_mm_scan *scan); void drm_mm_print(const struct drm_mm *mm, struct drm_printer *p); #endif
3 3 3 1 1 1 2 3 3 1 2 3 3 2 3 2 1 3 2 2 1 2 3 7 7 6 6 4 6 4 6 4 3 3 3 3 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 // SPDX-License-Identifier: GPL-2.0-only /* * Sony NFC Port-100 Series driver * Copyright (c) 2013, Intel Corporation. * * Partly based/Inspired by Stephen Tiedemann's nfcpy */ #include <linux/module.h> #include <linux/usb.h> #include <net/nfc/digital.h> #define VERSION "0.1" #define SONY_VENDOR_ID 0x054c #define RCS380S_PRODUCT_ID 0x06c1 #define RCS380P_PRODUCT_ID 0x06c3 #define PORT100_PROTOCOLS (NFC_PROTO_JEWEL_MASK | \ NFC_PROTO_MIFARE_MASK | \ NFC_PROTO_FELICA_MASK | \ NFC_PROTO_NFC_DEP_MASK | \ NFC_PROTO_ISO14443_MASK | \ NFC_PROTO_ISO14443_B_MASK) #define PORT100_CAPABILITIES (NFC_DIGITAL_DRV_CAPS_IN_CRC | \ NFC_DIGITAL_DRV_CAPS_TG_CRC) /* Standard port100 frame definitions */ #define PORT100_FRAME_HEADER_LEN (sizeof(struct port100_frame) \ + 2) /* data[0] CC, data[1] SCC */ #define PORT100_FRAME_TAIL_LEN 2 /* data[len] DCS, data[len + 1] postamble*/ #define PORT100_COMM_RF_HEAD_MAX_LEN (sizeof(struct port100_tg_comm_rf_cmd)) /* * Max extended frame payload len, excluding CC and SCC * which are already in PORT100_FRAME_HEADER_LEN. */ #define PORT100_FRAME_MAX_PAYLOAD_LEN 1001 #define PORT100_FRAME_ACK_SIZE 6 /* Preamble (1), SoPC (2), ACK Code (2), Postamble (1) */ static u8 ack_frame[PORT100_FRAME_ACK_SIZE] = { 0x00, 0x00, 0xff, 0x00, 0xff, 0x00 }; #define PORT100_FRAME_CHECKSUM(f) (f->data[le16_to_cpu(f->datalen)]) #define PORT100_FRAME_POSTAMBLE(f) (f->data[le16_to_cpu(f->datalen) + 1]) /* start of frame */ #define PORT100_FRAME_SOF 0x00FF #define PORT100_FRAME_EXT 0xFFFF #define PORT100_FRAME_ACK 0x00FF /* Port-100 command: in or out */ #define PORT100_FRAME_DIRECTION(f) (f->data[0]) /* CC */ #define PORT100_FRAME_DIR_OUT 0xD6 #define PORT100_FRAME_DIR_IN 0xD7 /* Port-100 sub-command */ #define PORT100_FRAME_CMD(f) (f->data[1]) /* SCC */ #define PORT100_CMD_GET_FIRMWARE_VERSION 0x20 #define PORT100_CMD_GET_COMMAND_TYPE 0x28 #define PORT100_CMD_SET_COMMAND_TYPE 0x2A #define PORT100_CMD_IN_SET_RF 0x00 #define PORT100_CMD_IN_SET_PROTOCOL 0x02 #define PORT100_CMD_IN_COMM_RF 0x04 #define PORT100_CMD_TG_SET_RF 0x40 #define PORT100_CMD_TG_SET_PROTOCOL 0x42 #define PORT100_CMD_TG_SET_RF_OFF 0x46 #define PORT100_CMD_TG_COMM_RF 0x48 #define PORT100_CMD_SWITCH_RF 0x06 #define PORT100_CMD_RESPONSE(cmd) (cmd + 1) #define PORT100_CMD_TYPE_IS_SUPPORTED(mask, cmd_type) \ ((mask) & (0x01 << (cmd_type))) #define PORT100_CMD_TYPE_0 0 #define PORT100_CMD_TYPE_1 1 #define PORT100_CMD_STATUS_OK 0x00 #define PORT100_CMD_STATUS_TIMEOUT 0x80 #define PORT100_MDAA_TGT_HAS_BEEN_ACTIVATED_MASK 0x01 #define PORT100_MDAA_TGT_WAS_ACTIVATED_MASK 0x02 struct port100; typedef void (*port100_send_async_complete_t)(struct port100 *dev, void *arg, struct sk_buff *resp); /* * Setting sets structure for in_set_rf command * * @in_*_set_number: Represent the entry indexes in the port-100 RF Base Table. * This table contains multiple RF setting sets required for RF * communication. * * @in_*_comm_type: Theses fields set the communication type to be used. */ struct port100_in_rf_setting { u8 in_send_set_number; u8 in_send_comm_type; u8 in_recv_set_number; u8 in_recv_comm_type; } __packed; #define PORT100_COMM_TYPE_IN_212F 0x01 #define PORT100_COMM_TYPE_IN_424F 0x02 #define PORT100_COMM_TYPE_IN_106A 0x03 #define PORT100_COMM_TYPE_IN_106B 0x07 static const struct port100_in_rf_setting in_rf_settings[] = { [NFC_DIGITAL_RF_TECH_212F] = { .in_send_set_number = 1, .in_send_comm_type = PORT100_COMM_TYPE_IN_212F, .in_recv_set_number = 15, .in_recv_comm_type = PORT100_COMM_TYPE_IN_212F, }, [NFC_DIGITAL_RF_TECH_424F] = { .in_send_set_number = 1, .in_send_comm_type = PORT100_COMM_TYPE_IN_424F, .in_recv_set_number = 15, .in_recv_comm_type = PORT100_COMM_TYPE_IN_424F, }, [NFC_DIGITAL_RF_TECH_106A] = { .in_send_set_number = 2, .in_send_comm_type = PORT100_COMM_TYPE_IN_106A, .in_recv_set_number = 15, .in_recv_comm_type = PORT100_COMM_TYPE_IN_106A, }, [NFC_DIGITAL_RF_TECH_106B] = { .in_send_set_number = 3, .in_send_comm_type = PORT100_COMM_TYPE_IN_106B, .in_recv_set_number = 15, .in_recv_comm_type = PORT100_COMM_TYPE_IN_106B, }, /* Ensures the array has NFC_DIGITAL_RF_TECH_LAST elements */ [NFC_DIGITAL_RF_TECH_LAST] = { 0 }, }; /** * struct port100_tg_rf_setting - Setting sets structure for tg_set_rf command * * @tg_set_number: Represents the entry index in the port-100 RF Base Table. * This table contains multiple RF setting sets required for RF * communication. this field is used for both send and receive * settings. * * @tg_comm_type: Sets the communication type to be used to send and receive * data. */ struct port100_tg_rf_setting { u8 tg_set_number; u8 tg_comm_type; } __packed; #define PORT100_COMM_TYPE_TG_106A 0x0B #define PORT100_COMM_TYPE_TG_212F 0x0C #define PORT100_COMM_TYPE_TG_424F 0x0D static const struct port100_tg_rf_setting tg_rf_settings[] = { [NFC_DIGITAL_RF_TECH_106A] = { .tg_set_number = 8, .tg_comm_type = PORT100_COMM_TYPE_TG_106A, }, [NFC_DIGITAL_RF_TECH_212F] = { .tg_set_number = 8, .tg_comm_type = PORT100_COMM_TYPE_TG_212F, }, [NFC_DIGITAL_RF_TECH_424F] = { .tg_set_number = 8, .tg_comm_type = PORT100_COMM_TYPE_TG_424F, }, /* Ensures the array has NFC_DIGITAL_RF_TECH_LAST elements */ [NFC_DIGITAL_RF_TECH_LAST] = { 0 }, }; #define PORT100_IN_PROT_INITIAL_GUARD_TIME 0x00 #define PORT100_IN_PROT_ADD_CRC 0x01 #define PORT100_IN_PROT_CHECK_CRC 0x02 #define PORT100_IN_PROT_MULTI_CARD 0x03 #define PORT100_IN_PROT_ADD_PARITY 0x04 #define PORT100_IN_PROT_CHECK_PARITY 0x05 #define PORT100_IN_PROT_BITWISE_AC_RECV_MODE 0x06 #define PORT100_IN_PROT_VALID_BIT_NUMBER 0x07 #define PORT100_IN_PROT_CRYPTO1 0x08 #define PORT100_IN_PROT_ADD_SOF 0x09 #define PORT100_IN_PROT_CHECK_SOF 0x0A #define PORT100_IN_PROT_ADD_EOF 0x0B #define PORT100_IN_PROT_CHECK_EOF 0x0C #define PORT100_IN_PROT_DEAF_TIME 0x0E #define PORT100_IN_PROT_CRM 0x0F #define PORT100_IN_PROT_CRM_MIN_LEN 0x10 #define PORT100_IN_PROT_T1_TAG_FRAME 0x11 #define PORT100_IN_PROT_RFCA 0x12 #define PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR 0x13 #define PORT100_IN_PROT_END 0x14 #define PORT100_IN_MAX_NUM_PROTOCOLS 19 #define PORT100_TG_PROT_TU 0x00 #define PORT100_TG_PROT_RF_OFF 0x01 #define PORT100_TG_PROT_CRM 0x02 #define PORT100_TG_PROT_END 0x03 #define PORT100_TG_MAX_NUM_PROTOCOLS 3 struct port100_protocol { u8 number; u8 value; } __packed; static const struct port100_protocol in_protocols[][PORT100_IN_MAX_NUM_PROTOCOLS + 1] = { [NFC_DIGITAL_FRAMING_NFCA_SHORT] = { { PORT100_IN_PROT_INITIAL_GUARD_TIME, 6 }, { PORT100_IN_PROT_ADD_CRC, 0 }, { PORT100_IN_PROT_CHECK_CRC, 0 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 0 }, { PORT100_IN_PROT_CHECK_PARITY, 1 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 7 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 0 }, { PORT100_IN_PROT_CHECK_SOF, 0 }, { PORT100_IN_PROT_ADD_EOF, 0 }, { PORT100_IN_PROT_CHECK_EOF, 0 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_STANDARD] = { { PORT100_IN_PROT_INITIAL_GUARD_TIME, 6 }, { PORT100_IN_PROT_ADD_CRC, 0 }, { PORT100_IN_PROT_CHECK_CRC, 0 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 1 }, { PORT100_IN_PROT_CHECK_PARITY, 1 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 0 }, { PORT100_IN_PROT_CHECK_SOF, 0 }, { PORT100_IN_PROT_ADD_EOF, 0 }, { PORT100_IN_PROT_CHECK_EOF, 0 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_STANDARD_WITH_CRC_A] = { { PORT100_IN_PROT_INITIAL_GUARD_TIME, 6 }, { PORT100_IN_PROT_ADD_CRC, 1 }, { PORT100_IN_PROT_CHECK_CRC, 1 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 1 }, { PORT100_IN_PROT_CHECK_PARITY, 1 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 0 }, { PORT100_IN_PROT_CHECK_SOF, 0 }, { PORT100_IN_PROT_ADD_EOF, 0 }, { PORT100_IN_PROT_CHECK_EOF, 0 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_T1T] = { /* nfc_digital_framing_nfca_short */ { PORT100_IN_PROT_ADD_CRC, 2 }, { PORT100_IN_PROT_CHECK_CRC, 2 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_T1_TAG_FRAME, 2 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_T2T] = { /* nfc_digital_framing_nfca_standard */ { PORT100_IN_PROT_ADD_CRC, 1 }, { PORT100_IN_PROT_CHECK_CRC, 0 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_T4T] = { /* nfc_digital_framing_nfca_standard_with_crc_a */ { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_NFC_DEP] = { /* nfc_digital_framing_nfca_standard */ { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF] = { { PORT100_IN_PROT_INITIAL_GUARD_TIME, 18 }, { PORT100_IN_PROT_ADD_CRC, 1 }, { PORT100_IN_PROT_CHECK_CRC, 1 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 0 }, { PORT100_IN_PROT_CHECK_PARITY, 0 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 0 }, { PORT100_IN_PROT_CHECK_SOF, 0 }, { PORT100_IN_PROT_ADD_EOF, 0 }, { PORT100_IN_PROT_CHECK_EOF, 0 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF_T3T] = { /* nfc_digital_framing_nfcf */ { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF_NFC_DEP] = { /* nfc_digital_framing_nfcf */ { PORT100_IN_PROT_INITIAL_GUARD_TIME, 18 }, { PORT100_IN_PROT_ADD_CRC, 1 }, { PORT100_IN_PROT_CHECK_CRC, 1 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 0 }, { PORT100_IN_PROT_CHECK_PARITY, 0 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 0 }, { PORT100_IN_PROT_CHECK_SOF, 0 }, { PORT100_IN_PROT_ADD_EOF, 0 }, { PORT100_IN_PROT_CHECK_EOF, 0 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFC_DEP_ACTIVATED] = { { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCB] = { { PORT100_IN_PROT_INITIAL_GUARD_TIME, 20 }, { PORT100_IN_PROT_ADD_CRC, 1 }, { PORT100_IN_PROT_CHECK_CRC, 1 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 0 }, { PORT100_IN_PROT_CHECK_PARITY, 0 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 1 }, { PORT100_IN_PROT_CHECK_SOF, 1 }, { PORT100_IN_PROT_ADD_EOF, 1 }, { PORT100_IN_PROT_CHECK_EOF, 1 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCB_T4T] = { /* nfc_digital_framing_nfcb */ { PORT100_IN_PROT_END, 0 }, }, /* Ensures the array has NFC_DIGITAL_FRAMING_LAST elements */ [NFC_DIGITAL_FRAMING_LAST] = { { PORT100_IN_PROT_END, 0 }, }, }; static const struct port100_protocol tg_protocols[][PORT100_TG_MAX_NUM_PROTOCOLS + 1] = { [NFC_DIGITAL_FRAMING_NFCA_SHORT] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_STANDARD] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_STANDARD_WITH_CRC_A] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_T1T] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_T2T] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_NFC_DEP] = { { PORT100_TG_PROT_TU, 1 }, { PORT100_TG_PROT_RF_OFF, 0 }, { PORT100_TG_PROT_CRM, 7 }, { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF_T3T] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF_NFC_DEP] = { { PORT100_TG_PROT_TU, 1 }, { PORT100_TG_PROT_RF_OFF, 0 }, { PORT100_TG_PROT_CRM, 7 }, { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFC_DEP_ACTIVATED] = { { PORT100_TG_PROT_RF_OFF, 1 }, { PORT100_TG_PROT_END, 0 }, }, /* Ensures the array has NFC_DIGITAL_FRAMING_LAST elements */ [NFC_DIGITAL_FRAMING_LAST] = { { PORT100_TG_PROT_END, 0 }, }, }; struct port100 { struct nfc_digital_dev *nfc_digital_dev; int skb_headroom; int skb_tailroom; struct usb_device *udev; struct usb_interface *interface; struct urb *out_urb; struct urb *in_urb; /* This mutex protects the out_urb and avoids to submit a new command * through port100_send_frame_async() while the previous one is being * canceled through port100_abort_cmd(). */ struct mutex out_urb_lock; struct work_struct cmd_complete_work; u8 cmd_type; /* The digital stack serializes commands to be sent. There is no need * for any queuing/locking mechanism at driver level. */ struct port100_cmd *cmd; bool cmd_cancel; struct completion cmd_cancel_done; }; struct port100_cmd { u8 code; int status; struct sk_buff *req; struct sk_buff *resp; int resp_len; port100_send_async_complete_t complete_cb; void *complete_cb_context; }; struct port100_frame { u8 preamble; __be16 start_frame; __be16 extended_frame; __le16 datalen; u8 datalen_checksum; u8 data[]; } __packed; struct port100_ack_frame { u8 preamble; __be16 start_frame; __be16 ack_frame; u8 postambule; } __packed; struct port100_cb_arg { nfc_digital_cmd_complete_t complete_cb; void *complete_arg; u8 mdaa; }; struct port100_tg_comm_rf_cmd { __le16 guard_time; __le16 send_timeout; u8 mdaa; u8 nfca_param[6]; u8 nfcf_param[18]; u8 mf_halted; u8 arae_flag; __le16 recv_timeout; u8 data[]; } __packed; struct port100_tg_comm_rf_res { u8 comm_type; u8 ar_status; u8 target_activated; __le32 status; u8 data[]; } __packed; /* The rule: value + checksum = 0 */ static inline u8 port100_checksum(u16 value) { return ~(((u8 *)&value)[0] + ((u8 *)&value)[1]) + 1; } /* The rule: sum(data elements) + checksum = 0 */ static u8 port100_data_checksum(const u8 *data, int datalen) { u8 sum = 0; int i; for (i = 0; i < datalen; i++) sum += data[i]; return port100_checksum(sum); } static void port100_tx_frame_init(void *_frame, u8 cmd_code) { struct port100_frame *frame = _frame; frame->preamble = 0; frame->start_frame = cpu_to_be16(PORT100_FRAME_SOF); frame->extended_frame = cpu_to_be16(PORT100_FRAME_EXT); PORT100_FRAME_DIRECTION(frame) = PORT100_FRAME_DIR_OUT; PORT100_FRAME_CMD(frame) = cmd_code; frame->datalen = cpu_to_le16(2); } static void port100_tx_frame_finish(void *_frame) { struct port100_frame *frame = _frame; frame->datalen_checksum = port100_checksum(le16_to_cpu(frame->datalen)); PORT100_FRAME_CHECKSUM(frame) = port100_data_checksum(frame->data, le16_to_cpu(frame->datalen)); PORT100_FRAME_POSTAMBLE(frame) = 0; } static void port100_tx_update_payload_len(void *_frame, int len) { struct port100_frame *frame = _frame; le16_add_cpu(&frame->datalen, len); } static bool port100_rx_frame_is_valid(const void *_frame) { u8 checksum; const struct port100_frame *frame = _frame; if (frame->start_frame != cpu_to_be16(PORT100_FRAME_SOF) || frame->extended_frame != cpu_to_be16(PORT100_FRAME_EXT)) return false; checksum = port100_checksum(le16_to_cpu(frame->datalen)); if (checksum != frame->datalen_checksum) return false; checksum = port100_data_checksum(frame->data, le16_to_cpu(frame->datalen)); if (checksum != PORT100_FRAME_CHECKSUM(frame)) return false; return true; } static bool port100_rx_frame_is_ack(const struct port100_ack_frame *frame) { return (frame->start_frame == cpu_to_be16(PORT100_FRAME_SOF) && frame->ack_frame == cpu_to_be16(PORT100_FRAME_ACK)); } static inline int port100_rx_frame_size(const void *frame) { const struct port100_frame *f = frame; return sizeof(struct port100_frame) + le16_to_cpu(f->datalen) + PORT100_FRAME_TAIL_LEN; } static bool port100_rx_frame_is_cmd_response(const struct port100 *dev, const void *frame) { const struct port100_frame *f = frame; return (PORT100_FRAME_CMD(f) == PORT100_CMD_RESPONSE(dev->cmd->code)); } static void port100_recv_response(struct urb *urb) { struct port100 *dev = urb->context; struct port100_cmd *cmd = dev->cmd; u8 *in_frame; cmd->status = urb->status; switch (urb->status) { case 0: break; /* success */ case -ECONNRESET: case -ENOENT: nfc_dbg(&dev->interface->dev, "The urb has been canceled (status %d)\n", urb->status); goto sched_wq; case -ESHUTDOWN: default: nfc_err(&dev->interface->dev, "Urb failure (status %d)\n", urb->status); goto sched_wq; } in_frame = dev->in_urb->transfer_buffer; if (!port100_rx_frame_is_valid(in_frame)) { nfc_err(&dev->interface->dev, "Received an invalid frame\n"); cmd->status = -EIO; goto sched_wq; } print_hex_dump_debug("PORT100 RX: ", DUMP_PREFIX_NONE, 16, 1, in_frame, port100_rx_frame_size(in_frame), false); if (!port100_rx_frame_is_cmd_response(dev, in_frame)) { nfc_err(&dev->interface->dev, "It's not the response to the last command\n"); cmd->status = -EIO; goto sched_wq; } sched_wq: schedule_work(&dev->cmd_complete_work); } static int port100_submit_urb_for_response(const struct port100 *dev, gfp_t flags) { dev->in_urb->complete = port100_recv_response; return usb_submit_urb(dev->in_urb, flags); } static void port100_recv_ack(struct urb *urb) { struct port100 *dev = urb->context; struct port100_cmd *cmd = dev->cmd; const struct port100_ack_frame *in_frame; int rc; cmd->status = urb->status; switch (urb->status) { case 0: break; /* success */ case -ECONNRESET: case -ENOENT: nfc_dbg(&dev->interface->dev, "The urb has been stopped (status %d)\n", urb->status); goto sched_wq; case -ESHUTDOWN: default: nfc_err(&dev->interface->dev, "Urb failure (status %d)\n", urb->status); goto sched_wq; } in_frame = dev->in_urb->transfer_buffer; if (!port100_rx_frame_is_ack(in_frame)) { nfc_err(&dev->interface->dev, "Received an invalid ack\n"); cmd->status = -EIO; goto sched_wq; } rc = port100_submit_urb_for_response(dev, GFP_ATOMIC); if (rc) { nfc_err(&dev->interface->dev, "usb_submit_urb failed with result %d\n", rc); cmd->status = rc; goto sched_wq; } return; sched_wq: schedule_work(&dev->cmd_complete_work); } static int port100_submit_urb_for_ack(const struct port100 *dev, gfp_t flags) { dev->in_urb->complete = port100_recv_ack; return usb_submit_urb(dev->in_urb, flags); } static int port100_send_ack(struct port100 *dev) { int rc = 0; mutex_lock(&dev->out_urb_lock); /* * If prior cancel is in-flight (dev->cmd_cancel == true), we * can skip to send cancel. Then this will wait the prior * cancel, or merged into the next cancel rarely if next * cancel was started before waiting done. In any case, this * will be waked up soon or later. */ if (!dev->cmd_cancel) { reinit_completion(&dev->cmd_cancel_done); usb_kill_urb(dev->out_urb); dev->out_urb->transfer_buffer = ack_frame; dev->out_urb->transfer_buffer_length = sizeof(ack_frame); rc = usb_submit_urb(dev->out_urb, GFP_KERNEL); /* * Set the cmd_cancel flag only if the URB has been * successfully submitted. It will be reset by the out * URB completion callback port100_send_complete(). */ dev->cmd_cancel = !rc; } mutex_unlock(&dev->out_urb_lock); if (!rc) wait_for_completion(&dev->cmd_cancel_done); return rc; } static int port100_send_frame_async(struct port100 *dev, const struct sk_buff *out, const struct sk_buff *in, int in_len) { int rc; mutex_lock(&dev->out_urb_lock); /* A command cancel frame as been sent through dev->out_urb. Don't try * to submit a new one. */ if (dev->cmd_cancel) { rc = -EAGAIN; goto exit; } dev->out_urb->transfer_buffer = out->data; dev->out_urb->transfer_buffer_length = out->len; dev->in_urb->transfer_buffer = in->data; dev->in_urb->transfer_buffer_length = in_len; print_hex_dump_debug("PORT100 TX: ", DUMP_PREFIX_NONE, 16, 1, out->data, out->len, false); rc = usb_submit_urb(dev->out_urb, GFP_KERNEL); if (rc) goto exit; rc = port100_submit_urb_for_ack(dev, GFP_KERNEL); if (rc) usb_kill_urb(dev->out_urb); exit: mutex_unlock(&dev->out_urb_lock); return rc; } static void port100_build_cmd_frame(struct port100 *dev, u8 cmd_code, struct sk_buff *skb) { /* payload is already there, just update datalen */ int payload_len = skb->len; skb_push(skb, PORT100_FRAME_HEADER_LEN); skb_put(skb, PORT100_FRAME_TAIL_LEN); port100_tx_frame_init(skb->data, cmd_code); port100_tx_update_payload_len(skb->data, payload_len); port100_tx_frame_finish(skb->data); } static void port100_send_async_complete(struct port100 *dev) { struct port100_cmd *cmd = dev->cmd; int status = cmd->status; struct sk_buff *req = cmd->req; struct sk_buff *resp = cmd->resp; dev_kfree_skb(req); dev->cmd = NULL; if (status < 0) { cmd->complete_cb(dev, cmd->complete_cb_context, ERR_PTR(status)); dev_kfree_skb(resp); goto done; } skb_put(resp, port100_rx_frame_size(resp->data)); skb_pull(resp, PORT100_FRAME_HEADER_LEN); skb_trim(resp, resp->len - PORT100_FRAME_TAIL_LEN); cmd->complete_cb(dev, cmd->complete_cb_context, resp); done: kfree(cmd); } static int port100_send_cmd_async(struct port100 *dev, u8 cmd_code, struct sk_buff *req, port100_send_async_complete_t complete_cb, void *complete_cb_context) { struct port100_cmd *cmd; struct sk_buff *resp; int rc; int resp_len = PORT100_FRAME_HEADER_LEN + PORT100_FRAME_MAX_PAYLOAD_LEN + PORT100_FRAME_TAIL_LEN; if (dev->cmd) { nfc_err(&dev->interface->dev, "A command is still in process\n"); return -EBUSY; } resp = alloc_skb(resp_len, GFP_KERNEL); if (!resp) return -ENOMEM; cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); if (!cmd) { dev_kfree_skb(resp); return -ENOMEM; } cmd->code = cmd_code; cmd->req = req; cmd->resp = resp; cmd->resp_len = resp_len; cmd->complete_cb = complete_cb; cmd->complete_cb_context = complete_cb_context; port100_build_cmd_frame(dev, cmd_code, req); dev->cmd = cmd; rc = port100_send_frame_async(dev, req, resp, resp_len); if (rc) { kfree(cmd); dev_kfree_skb(resp); dev->cmd = NULL; } return rc; } struct port100_sync_cmd_response { struct sk_buff *resp; struct completion done; }; static void port100_wq_cmd_complete(struct work_struct *work) { struct port100 *dev = container_of(work, struct port100, cmd_complete_work); port100_send_async_complete(dev); } static void port100_send_sync_complete(struct port100 *dev, void *_arg, struct sk_buff *resp) { struct port100_sync_cmd_response *arg = _arg; arg->resp = resp; complete(&arg->done); } static struct sk_buff *port100_send_cmd_sync(struct port100 *dev, u8 cmd_code, struct sk_buff *req) { int rc; struct port100_sync_cmd_response arg; init_completion(&arg.done); rc = port100_send_cmd_async(dev, cmd_code, req, port100_send_sync_complete, &arg); if (rc) { dev_kfree_skb(req); return ERR_PTR(rc); } wait_for_completion(&arg.done); return arg.resp; } static void port100_send_complete(struct urb *urb) { struct port100 *dev = urb->context; if (dev->cmd_cancel) { complete_all(&dev->cmd_cancel_done); dev->cmd_cancel = false; } switch (urb->status) { case 0: break; /* success */ case -ECONNRESET: case -ENOENT: nfc_dbg(&dev->interface->dev, "The urb has been stopped (status %d)\n", urb->status); break; case -ESHUTDOWN: default: nfc_err(&dev->interface->dev, "Urb failure (status %d)\n", urb->status); } } static void port100_abort_cmd(struct nfc_digital_dev *ddev) { struct port100 *dev = nfc_digital_get_drvdata(ddev); /* An ack will cancel the last issued command */ port100_send_ack(dev); /* cancel the urb request */ usb_kill_urb(dev->in_urb); } static struct sk_buff *port100_alloc_skb(const struct port100 *dev, unsigned int size) { struct sk_buff *skb; skb = alloc_skb(dev->skb_headroom + dev->skb_tailroom + size, GFP_KERNEL); if (skb) skb_reserve(skb, dev->skb_headroom); return skb; } static int port100_set_command_type(struct port100 *dev, u8 command_type) { struct sk_buff *skb; struct sk_buff *resp; int rc; skb = port100_alloc_skb(dev, 1); if (!skb) return -ENOMEM; skb_put_u8(skb, command_type); resp = port100_send_cmd_sync(dev, PORT100_CMD_SET_COMMAND_TYPE, skb); if (IS_ERR(resp)) return PTR_ERR(resp); rc = resp->data[0]; dev_kfree_skb(resp); return rc; } static u64 port100_get_command_type_mask(struct port100 *dev) { struct sk_buff *skb; struct sk_buff *resp; u64 mask; skb = port100_alloc_skb(dev, 0); if (!skb) return 0; resp = port100_send_cmd_sync(dev, PORT100_CMD_GET_COMMAND_TYPE, skb); if (IS_ERR(resp)) return 0; if (resp->len < 8) mask = 0; else mask = be64_to_cpu(*(__be64 *)resp->data); dev_kfree_skb(resp); return mask; } static u16 port100_get_firmware_version(struct port100 *dev) { struct sk_buff *skb; struct sk_buff *resp; u16 fw_ver; skb = port100_alloc_skb(dev, 0); if (!skb) return 0; resp = port100_send_cmd_sync(dev, PORT100_CMD_GET_FIRMWARE_VERSION, skb); if (IS_ERR(resp)) return 0; fw_ver = le16_to_cpu(*(__le16 *)resp->data); dev_kfree_skb(resp); return fw_ver; } static int port100_switch_rf(struct nfc_digital_dev *ddev, bool on) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct sk_buff *skb, *resp; skb = port100_alloc_skb(dev, 1); if (!skb) return -ENOMEM; skb_put_u8(skb, on ? 1 : 0); /* Cancel the last command if the device is being switched off */ if (!on) port100_abort_cmd(ddev); resp = port100_send_cmd_sync(dev, PORT100_CMD_SWITCH_RF, skb); if (IS_ERR(resp)) return PTR_ERR(resp); dev_kfree_skb(resp); return 0; } static int port100_in_set_rf(struct nfc_digital_dev *ddev, u8 rf) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct sk_buff *skb; struct sk_buff *resp; int rc; if (rf >= NFC_DIGITAL_RF_TECH_LAST) return -EINVAL; skb = port100_alloc_skb(dev, sizeof(struct port100_in_rf_setting)); if (!skb) return -ENOMEM; skb_put_data(skb, &in_rf_settings[rf], sizeof(struct port100_in_rf_setting)); resp = port100_send_cmd_sync(dev, PORT100_CMD_IN_SET_RF, skb); if (IS_ERR(resp)) return PTR_ERR(resp); rc = resp->data[0]; dev_kfree_skb(resp); return rc; } static int port100_in_set_framing(struct nfc_digital_dev *ddev, int param) { struct port100 *dev = nfc_digital_get_drvdata(ddev); const struct port100_protocol *protocols; struct sk_buff *skb; struct sk_buff *resp; int num_protocols; size_t size; int rc; if (param >= NFC_DIGITAL_FRAMING_LAST) return -EINVAL; protocols = in_protocols[param]; num_protocols = 0; while (protocols[num_protocols].number != PORT100_IN_PROT_END) num_protocols++; if (!num_protocols) return 0; size = sizeof(struct port100_protocol) * num_protocols; skb = port100_alloc_skb(dev, size); if (!skb) return -ENOMEM; skb_put_data(skb, protocols, size); resp = port100_send_cmd_sync(dev, PORT100_CMD_IN_SET_PROTOCOL, skb); if (IS_ERR(resp)) return PTR_ERR(resp); rc = resp->data[0]; dev_kfree_skb(resp); return rc; } static int port100_in_configure_hw(struct nfc_digital_dev *ddev, int type, int param) { if (type == NFC_DIGITAL_CONFIG_RF_TECH) return port100_in_set_rf(ddev, param); if (type == NFC_DIGITAL_CONFIG_FRAMING) return port100_in_set_framing(ddev, param); return -EINVAL; } static void port100_in_comm_rf_complete(struct port100 *dev, void *arg, struct sk_buff *resp) { const struct port100_cb_arg *cb_arg = arg; nfc_digital_cmd_complete_t cb = cb_arg->complete_cb; u32 status; int rc; if (IS_ERR(resp)) { rc = PTR_ERR(resp); goto exit; } if (resp->len < 4) { nfc_err(&dev->interface->dev, "Invalid packet length received\n"); rc = -EIO; goto error; } status = le32_to_cpu(*(__le32 *)resp->data); skb_pull(resp, sizeof(u32)); if (status == PORT100_CMD_STATUS_TIMEOUT) { rc = -ETIMEDOUT; goto error; } if (status != PORT100_CMD_STATUS_OK) { nfc_err(&dev->interface->dev, "in_comm_rf failed with status 0x%08x\n", status); rc = -EIO; goto error; } /* Remove collision bits byte */ skb_pull(resp, 1); goto exit; error: kfree_skb(resp); resp = ERR_PTR(rc); exit: cb(dev->nfc_digital_dev, cb_arg->complete_arg, resp); kfree(cb_arg); } static int port100_in_send_cmd(struct nfc_digital_dev *ddev, struct sk_buff *skb, u16 _timeout, nfc_digital_cmd_complete_t cb, void *arg) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct port100_cb_arg *cb_arg; __le16 timeout; cb_arg = kzalloc(sizeof(struct port100_cb_arg), GFP_KERNEL); if (!cb_arg) return -ENOMEM; cb_arg->complete_cb = cb; cb_arg->complete_arg = arg; timeout = cpu_to_le16(_timeout * 10); memcpy(skb_push(skb, sizeof(__le16)), &timeout, sizeof(__le16)); return port100_send_cmd_async(dev, PORT100_CMD_IN_COMM_RF, skb, port100_in_comm_rf_complete, cb_arg); } static int port100_tg_set_rf(struct nfc_digital_dev *ddev, u8 rf) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct sk_buff *skb; struct sk_buff *resp; int rc; if (rf >= NFC_DIGITAL_RF_TECH_LAST) return -EINVAL; skb = port100_alloc_skb(dev, sizeof(struct port100_tg_rf_setting)); if (!skb) return -ENOMEM; skb_put_data(skb, &tg_rf_settings[rf], sizeof(struct port100_tg_rf_setting)); resp = port100_send_cmd_sync(dev, PORT100_CMD_TG_SET_RF, skb); if (IS_ERR(resp)) return PTR_ERR(resp); rc = resp->data[0]; dev_kfree_skb(resp); return rc; } static int port100_tg_set_framing(struct nfc_digital_dev *ddev, int param) { struct port100 *dev = nfc_digital_get_drvdata(ddev); const struct port100_protocol *protocols; struct sk_buff *skb; struct sk_buff *resp; int rc; int num_protocols; size_t size; if (param >= NFC_DIGITAL_FRAMING_LAST) return -EINVAL; protocols = tg_protocols[param]; num_protocols = 0; while (protocols[num_protocols].number != PORT100_TG_PROT_END) num_protocols++; if (!num_protocols) return 0; size = sizeof(struct port100_protocol) * num_protocols; skb = port100_alloc_skb(dev, size); if (!skb) return -ENOMEM; skb_put_data(skb, protocols, size); resp = port100_send_cmd_sync(dev, PORT100_CMD_TG_SET_PROTOCOL, skb); if (IS_ERR(resp)) return PTR_ERR(resp); rc = resp->data[0]; dev_kfree_skb(resp); return rc; } static int port100_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param) { if (type == NFC_DIGITAL_CONFIG_RF_TECH) return port100_tg_set_rf(ddev, param); if (type == NFC_DIGITAL_CONFIG_FRAMING) return port100_tg_set_framing(ddev, param); return -EINVAL; } static bool port100_tg_target_activated(struct port100 *dev, u8 tgt_activated) { u8 mask; switch (dev->cmd_type) { case PORT100_CMD_TYPE_0: mask = PORT100_MDAA_TGT_HAS_BEEN_ACTIVATED_MASK; break; case PORT100_CMD_TYPE_1: mask = PORT100_MDAA_TGT_HAS_BEEN_ACTIVATED_MASK | PORT100_MDAA_TGT_WAS_ACTIVATED_MASK; break; default: nfc_err(&dev->interface->dev, "Unknown command type\n"); return false; } return ((tgt_activated & mask) == mask); } static void port100_tg_comm_rf_complete(struct port100 *dev, void *arg, struct sk_buff *resp) { u32 status; const struct port100_cb_arg *cb_arg = arg; nfc_digital_cmd_complete_t cb = cb_arg->complete_cb; struct port100_tg_comm_rf_res *hdr; if (IS_ERR(resp)) goto exit; hdr = (struct port100_tg_comm_rf_res *)resp->data; status = le32_to_cpu(hdr->status); if (cb_arg->mdaa && !port100_tg_target_activated(dev, hdr->target_activated)) { kfree_skb(resp); resp = ERR_PTR(-ETIMEDOUT); goto exit; } skb_pull(resp, sizeof(struct port100_tg_comm_rf_res)); if (status != PORT100_CMD_STATUS_OK) { kfree_skb(resp); if (status == PORT100_CMD_STATUS_TIMEOUT) resp = ERR_PTR(-ETIMEDOUT); else resp = ERR_PTR(-EIO); } exit: cb(dev->nfc_digital_dev, cb_arg->complete_arg, resp); kfree(cb_arg); } static int port100_tg_send_cmd(struct nfc_digital_dev *ddev, struct sk_buff *skb, u16 timeout, nfc_digital_cmd_complete_t cb, void *arg) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct port100_tg_comm_rf_cmd *hdr; struct port100_cb_arg *cb_arg; cb_arg = kzalloc(sizeof(struct port100_cb_arg), GFP_KERNEL); if (!cb_arg) return -ENOMEM; cb_arg->complete_cb = cb; cb_arg->complete_arg = arg; skb_push(skb, sizeof(struct port100_tg_comm_rf_cmd)); hdr = (struct port100_tg_comm_rf_cmd *)skb->data; memset(hdr, 0, sizeof(struct port100_tg_comm_rf_cmd)); hdr->guard_time = cpu_to_le16(500); hdr->send_timeout = cpu_to_le16(0xFFFF); hdr->recv_timeout = cpu_to_le16(timeout); return port100_send_cmd_async(dev, PORT100_CMD_TG_COMM_RF, skb, port100_tg_comm_rf_complete, cb_arg); } static int port100_listen_mdaa(struct nfc_digital_dev *ddev, struct digital_tg_mdaa_params *params, u16 timeout, nfc_digital_cmd_complete_t cb, void *arg) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct port100_tg_comm_rf_cmd *hdr; struct port100_cb_arg *cb_arg; struct sk_buff *skb; int rc; rc = port100_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, NFC_DIGITAL_RF_TECH_106A); if (rc) return rc; rc = port100_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, NFC_DIGITAL_FRAMING_NFCA_NFC_DEP); if (rc) return rc; cb_arg = kzalloc(sizeof(struct port100_cb_arg), GFP_KERNEL); if (!cb_arg) return -ENOMEM; cb_arg->complete_cb = cb; cb_arg->complete_arg = arg; cb_arg->mdaa = 1; skb = port100_alloc_skb(dev, 0); if (!skb) { kfree(cb_arg); return -ENOMEM; } skb_push(skb, sizeof(struct port100_tg_comm_rf_cmd)); hdr = (struct port100_tg_comm_rf_cmd *)skb->data; memset(hdr, 0, sizeof(struct port100_tg_comm_rf_cmd)); hdr->guard_time = 0; hdr->send_timeout = cpu_to_le16(0xFFFF); hdr->mdaa = 1; hdr->nfca_param[0] = (params->sens_res >> 8) & 0xFF; hdr->nfca_param[1] = params->sens_res & 0xFF; memcpy(hdr->nfca_param + 2, params->nfcid1, 3); hdr->nfca_param[5] = params->sel_res; memcpy(hdr->nfcf_param, params->nfcid2, 8); hdr->nfcf_param[16] = (params->sc >> 8) & 0xFF; hdr->nfcf_param[17] = params->sc & 0xFF; hdr->recv_timeout = cpu_to_le16(timeout); return port100_send_cmd_async(dev, PORT100_CMD_TG_COMM_RF, skb, port100_tg_comm_rf_complete, cb_arg); } static int port100_listen(struct nfc_digital_dev *ddev, u16 timeout, nfc_digital_cmd_complete_t cb, void *arg) { const struct port100 *dev = nfc_digital_get_drvdata(ddev); struct sk_buff *skb; skb = port100_alloc_skb(dev, 0); if (!skb) return -ENOMEM; return port100_tg_send_cmd(ddev, skb, timeout, cb, arg); } static const struct nfc_digital_ops port100_digital_ops = { .in_configure_hw = port100_in_configure_hw, .in_send_cmd = port100_in_send_cmd, .tg_listen_mdaa = port100_listen_mdaa, .tg_listen = port100_listen, .tg_configure_hw = port100_tg_configure_hw, .tg_send_cmd = port100_tg_send_cmd, .switch_rf = port100_switch_rf, .abort_cmd = port100_abort_cmd, }; static const struct usb_device_id port100_table[] = { { USB_DEVICE(SONY_VENDOR_ID, RCS380S_PRODUCT_ID), }, { USB_DEVICE(SONY_VENDOR_ID, RCS380P_PRODUCT_ID), }, { } }; MODULE_DEVICE_TABLE(usb, port100_table); static int port100_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct port100 *dev; int rc; struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *endpoint; int in_endpoint; int out_endpoint; u16 fw_version; u64 cmd_type_mask; int i; dev = devm_kzalloc(&interface->dev, sizeof(struct port100), GFP_KERNEL); if (!dev) return -ENOMEM; mutex_init(&dev->out_urb_lock); dev->udev = usb_get_dev(interface_to_usbdev(interface)); dev->interface = interface; usb_set_intfdata(interface, dev); in_endpoint = out_endpoint = 0; iface_desc = interface->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; if (!in_endpoint && usb_endpoint_is_bulk_in(endpoint)) in_endpoint = endpoint->bEndpointAddress; if (!out_endpoint && usb_endpoint_is_bulk_out(endpoint)) out_endpoint = endpoint->bEndpointAddress; } if (!in_endpoint || !out_endpoint) { nfc_err(&interface->dev, "Could not find bulk-in or bulk-out endpoint\n"); rc = -ENODEV; goto error; } dev->in_urb = usb_alloc_urb(0, GFP_KERNEL); dev->out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!dev->in_urb || !dev->out_urb) { nfc_err(&interface->dev, "Could not allocate USB URBs\n"); rc = -ENOMEM; goto error; } usb_fill_bulk_urb(dev->in_urb, dev->udev, usb_rcvbulkpipe(dev->udev, in_endpoint), NULL, 0, NULL, dev); usb_fill_bulk_urb(dev->out_urb, dev->udev, usb_sndbulkpipe(dev->udev, out_endpoint), NULL, 0, port100_send_complete, dev); dev->out_urb->transfer_flags = URB_ZERO_PACKET; dev->skb_headroom = PORT100_FRAME_HEADER_LEN + PORT100_COMM_RF_HEAD_MAX_LEN; dev->skb_tailroom = PORT100_FRAME_TAIL_LEN; init_completion(&dev->cmd_cancel_done); INIT_WORK(&dev->cmd_complete_work, port100_wq_cmd_complete); /* The first thing to do with the Port-100 is to set the command type * to be used. If supported we use command type 1. 0 otherwise. */ cmd_type_mask = port100_get_command_type_mask(dev); if (!cmd_type_mask) { nfc_err(&interface->dev, "Could not get supported command types\n"); rc = -ENODEV; goto error; } if (PORT100_CMD_TYPE_IS_SUPPORTED(cmd_type_mask, PORT100_CMD_TYPE_1)) dev->cmd_type = PORT100_CMD_TYPE_1; else dev->cmd_type = PORT100_CMD_TYPE_0; rc = port100_set_command_type(dev, dev->cmd_type); if (rc) { nfc_err(&interface->dev, "The device does not support command type %u\n", dev->cmd_type); goto error; } fw_version = port100_get_firmware_version(dev); if (!fw_version) nfc_err(&interface->dev, "Could not get device firmware version\n"); nfc_info(&interface->dev, "Sony NFC Port-100 Series attached (firmware v%x.%02x)\n", (fw_version & 0xFF00) >> 8, fw_version & 0xFF); dev->nfc_digital_dev = nfc_digital_allocate_device(&port100_digital_ops, PORT100_PROTOCOLS, PORT100_CAPABILITIES, dev->skb_headroom, dev->skb_tailroom); if (!dev->nfc_digital_dev) { nfc_err(&interface->dev, "Could not allocate nfc_digital_dev\n"); rc = -ENOMEM; goto error; } nfc_digital_set_parent_dev(dev->nfc_digital_dev, &interface->dev); nfc_digital_set_drvdata(dev->nfc_digital_dev, dev); rc = nfc_digital_register_device(dev->nfc_digital_dev); if (rc) { nfc_err(&interface->dev, "Could not register digital device\n"); goto free_nfc_dev; } return 0; free_nfc_dev: nfc_digital_free_device(dev->nfc_digital_dev); error: usb_kill_urb(dev->in_urb); usb_free_urb(dev->in_urb); usb_kill_urb(dev->out_urb); usb_free_urb(dev->out_urb); usb_put_dev(dev->udev); return rc; } static void port100_disconnect(struct usb_interface *interface) { struct port100 *dev; dev = usb_get_intfdata(interface); usb_set_intfdata(interface, NULL); nfc_digital_unregister_device(dev->nfc_digital_dev); nfc_digital_free_device(dev->nfc_digital_dev); usb_kill_urb(dev->in_urb); usb_kill_urb(dev->out_urb); usb_free_urb(dev->in_urb); usb_free_urb(dev->out_urb); usb_put_dev(dev->udev); kfree(dev->cmd); nfc_info(&interface->dev, "Sony Port-100 NFC device disconnected\n"); } static struct usb_driver port100_driver = { .name = "port100", .probe = port100_probe, .disconnect = port100_disconnect, .id_table = port100_table, }; module_usb_driver(port100_driver); MODULE_DESCRIPTION("NFC Port-100 series usb driver ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL");
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 // SPDX-License-Identifier: GPL-2.0-only /* * Yama Linux Security Module * * Author: Kees Cook <keescook@chromium.org> * * Copyright (C) 2010 Canonical, Ltd. * Copyright (C) 2011 The Chromium OS Authors. */ #include <linux/lsm_hooks.h> #include <linux/sysctl.h> #include <linux/ptrace.h> #include <linux/prctl.h> #include <linux/ratelimit.h> #include <linux/workqueue.h> #include <linux/string_helpers.h> #include <linux/task_work.h> #include <linux/sched.h> #include <linux/spinlock.h> #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 #define YAMA_SCOPE_CAPABILITY 2 #define YAMA_SCOPE_NO_ATTACH 3 static int ptrace_scope = YAMA_SCOPE_RELATIONAL; /* describe a ptrace relationship for potential exception */ struct ptrace_relation { struct task_struct *tracer; struct task_struct *tracee; bool invalid; struct list_head node; struct rcu_head rcu; }; static LIST_HEAD(ptracer_relations); static DEFINE_SPINLOCK(ptracer_relations_lock); static void yama_relation_cleanup(struct work_struct *work); static DECLARE_WORK(yama_relation_work, yama_relation_cleanup); struct access_report_info { struct callback_head work; const char *access; struct task_struct *target; struct task_struct *agent; }; static void __report_access(struct callback_head *work) { struct access_report_info *info = container_of(work, struct access_report_info, work); char *target_cmd, *agent_cmd; target_cmd = kstrdup_quotable_cmdline(info->target, GFP_KERNEL); agent_cmd = kstrdup_quotable_cmdline(info->agent, GFP_KERNEL); pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", info->access, target_cmd, info->target->pid, agent_cmd, info->agent->pid); kfree(agent_cmd); kfree(target_cmd); put_task_struct(info->agent); put_task_struct(info->target); kfree(info); } /* defers execution because cmdline access can sleep */ static void report_access(const char *access, struct task_struct *target, struct task_struct *agent) { struct access_report_info *info; char agent_comm[sizeof(agent->comm)]; assert_spin_locked(&target->alloc_lock); /* for target->comm */ if (current->flags & PF_KTHREAD) { /* I don't think kthreads call task_work_run() before exiting. * Imagine angry ranting about procfs here. */ pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", access, target->comm, target->pid, get_task_comm(agent_comm, agent), agent->pid); return; } info = kmalloc(sizeof(*info), GFP_ATOMIC); if (!info) return; init_task_work(&info->work, __report_access); get_task_struct(target); get_task_struct(agent); info->access = access; info->target = target; info->agent = agent; if (task_work_add(current, &info->work, TWA_RESUME) == 0) return; /* success */ WARN(1, "report_access called from exiting task"); put_task_struct(target); put_task_struct(agent); kfree(info); } /** * yama_relation_cleanup - remove invalid entries from the relation list * */ static void yama_relation_cleanup(struct work_struct *work) { struct ptrace_relation *relation; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) { list_del_rcu(&relation->node); kfree_rcu(relation, rcu); } } rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); } /** * yama_ptracer_add - add/replace an exception for this tracer/tracee pair * @tracer: the task_struct of the process doing the ptrace * @tracee: the task_struct of the process to be ptraced * * Each tracee can have, at most, one tracer registered. Each time this * is called, the prior registered tracer will be replaced for the tracee. * * Returns 0 if relationship was added, -ve on error. */ static int yama_ptracer_add(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation, *added; added = kmalloc(sizeof(*added), GFP_KERNEL); if (!added) return -ENOMEM; added->tracee = tracee; added->tracer = tracer; added->invalid = false; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { list_replace_rcu(&relation->node, &added->node); kfree_rcu(relation, rcu); goto out; } } list_add_rcu(&added->node, &ptracer_relations); out: rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); return 0; } /** * yama_ptracer_del - remove exceptions related to the given tasks * @tracer: remove any relation where tracer task matches * @tracee: remove any relation where tracee task matches */ static void yama_ptracer_del(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation; bool marked = false; rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee || (tracer && relation->tracer == tracer)) { relation->invalid = true; marked = true; } } rcu_read_unlock(); if (marked) schedule_work(&yama_relation_work); } /** * yama_task_free - check for task_pid to remove from exception list * @task: task being removed */ static void yama_task_free(struct task_struct *task) { yama_ptracer_del(task, task); } /** * yama_task_prctl - check for Yama-specific prctl operations * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Return 0 on success, -ve on error. -ENOSYS is returned when Yama * does not handle the given option. */ static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int rc = -ENOSYS; struct task_struct *myself = current; switch (option) { case PR_SET_PTRACER: /* Since a thread can call prctl(), find the group leader * before calling _add() or _del() on it, since we want * process-level granularity of control. The tracer group * leader checking is handled later when walking the ancestry * at the time of PTRACE_ATTACH check. */ rcu_read_lock(); if (!thread_group_leader(myself)) myself = rcu_dereference(myself->group_leader); get_task_struct(myself); rcu_read_unlock(); if (arg2 == 0) { yama_ptracer_del(NULL, myself); rc = 0; } else if (arg2 == PR_SET_PTRACER_ANY || (int)arg2 == -1) { rc = yama_ptracer_add(NULL, myself); } else { struct task_struct *tracer; tracer = find_get_task_by_vpid(arg2); if (!tracer) { rc = -EINVAL; } else { rc = yama_ptracer_add(tracer, myself); put_task_struct(tracer); } } put_task_struct(myself); break; } return rc; } /** * task_is_descendant - walk up a process family tree looking for a match * @parent: the process to compare against while walking up from child * @child: the process to start from while looking upwards for parent * * Returns 1 if child is a descendant of parent, 0 if not. */ static int task_is_descendant(struct task_struct *parent, struct task_struct *child) { int rc = 0; struct task_struct *walker = child; if (!parent || !child) return 0; rcu_read_lock(); if (!thread_group_leader(parent)) parent = rcu_dereference(parent->group_leader); while (walker->pid > 0) { if (!thread_group_leader(walker)) walker = rcu_dereference(walker->group_leader); if (walker == parent) { rc = 1; break; } walker = rcu_dereference(walker->real_parent); } rcu_read_unlock(); return rc; } /** * ptracer_exception_found - tracer registered as exception for this tracee * @tracer: the task_struct of the process attempting ptrace * @tracee: the task_struct of the process to be ptraced * * Returns 1 if tracer has a ptracer exception ancestor for tracee. */ static int ptracer_exception_found(struct task_struct *tracer, struct task_struct *tracee) { int rc = 0; struct ptrace_relation *relation; struct task_struct *parent = NULL; bool found = false; rcu_read_lock(); /* * If there's already an active tracing relationship, then make an * exception for the sake of other accesses, like process_vm_rw(). */ parent = ptrace_parent(tracee); if (parent != NULL && same_thread_group(parent, tracer)) { rc = 1; goto unlock; } /* Look for a PR_SET_PTRACER relationship. */ if (!thread_group_leader(tracee)) tracee = rcu_dereference(tracee->group_leader); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { parent = relation->tracer; found = true; break; } } if (found && (parent == NULL || task_is_descendant(parent, tracer))) rc = 1; unlock: rcu_read_unlock(); return rc; } /** * yama_ptrace_access_check - validate PTRACE_ATTACH calls * @child: task that current task is attempting to ptrace * @mode: ptrace attach mode * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_access_check(struct task_struct *child, unsigned int mode) { int rc = 0; /* require ptrace target be a child of ptracer on attach */ if (mode & PTRACE_MODE_ATTACH) { switch (ptrace_scope) { case YAMA_SCOPE_DISABLED: /* No additional restrictions. */ break; case YAMA_SCOPE_RELATIONAL: rcu_read_lock(); if (!pid_alive(child)) rc = -EPERM; if (!rc && !task_is_descendant(current, child) && !ptracer_exception_found(current, child) && !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_CAPABILITY: rcu_read_lock(); if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: default: rc = -EPERM; break; } } if (rc && (mode & PTRACE_MODE_NOAUDIT) == 0) report_access("attach", child, current); return rc; } /** * yama_ptrace_traceme - validate PTRACE_TRACEME calls * @parent: task that will become the ptracer of the current task * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_traceme(struct task_struct *parent) { int rc = 0; /* Only disallow PTRACE_TRACEME on more aggressive settings. */ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: if (!has_ns_capability(parent, current_user_ns(), CAP_SYS_PTRACE)) rc = -EPERM; break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM; break; } if (rc) { task_lock(current); report_access("traceme", current, parent); task_unlock(current); } return rc; } static struct security_hook_list yama_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme), LSM_HOOK_INIT(task_prctl, yama_task_prctl), LSM_HOOK_INIT(task_free, yama_task_free), }; #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; if (write && !capable(CAP_SYS_PTRACE)) return -EPERM; /* Lock the max value if it ever gets set. */ table_copy = *table; if (*(int *)table_copy.data == *(int *)table_copy.extra2) table_copy.extra1 = table_copy.extra2; return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); } static int max_scope = YAMA_SCOPE_NO_ATTACH; static struct ctl_table yama_sysctl_table[] = { { .procname = "ptrace_scope", .data = &ptrace_scope, .maxlen = sizeof(int), .mode = 0644, .proc_handler = yama_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &max_scope, }, { } }; static void __init yama_init_sysctl(void) { if (!register_sysctl("kernel/yama", yama_sysctl_table)) panic("Yama: sysctl registration failed.\n"); } #else static inline void yama_init_sysctl(void) { } #endif /* CONFIG_SYSCTL */ static int __init yama_init(void) { pr_info("Yama: becoming mindful.\n"); security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), "yama"); yama_init_sysctl(); return 0; } DEFINE_LSM(yama) = { .name = "yama", .init = yama_init, };
19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 81 343 343 343 343 343 343 19 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/inode.c - kernfs inode implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/security.h> #include "kernfs-internal.h" static const struct inode_operations kernfs_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, }; static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc) { static DEFINE_MUTEX(iattr_mutex); struct kernfs_iattrs *ret; mutex_lock(&iattr_mutex); if (kn->iattr || !alloc) goto out_unlock; kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL); if (!kn->iattr) goto out_unlock; /* assign default attributes */ kn->iattr->ia_uid = GLOBAL_ROOT_UID; kn->iattr->ia_gid = GLOBAL_ROOT_GID; ktime_get_real_ts64(&kn->iattr->ia_atime); kn->iattr->ia_mtime = kn->iattr->ia_atime; kn->iattr->ia_ctime = kn->iattr->ia_atime; simple_xattrs_init(&kn->iattr->xattrs); atomic_set(&kn->iattr->nr_user_xattrs, 0); atomic_set(&kn->iattr->user_xattr_size, 0); out_unlock: ret = kn->iattr; mutex_unlock(&iattr_mutex); return ret; } static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { return __kernfs_iattrs(kn, 1); } static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn) { return __kernfs_iattrs(kn, 0); } int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { struct kernfs_iattrs *attrs; unsigned int ia_valid = iattr->ia_valid; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; if (ia_valid & ATTR_UID) attrs->ia_uid = iattr->ia_uid; if (ia_valid & ATTR_GID) attrs->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) attrs->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) attrs->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) attrs->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) kn->mode = iattr->ia_mode; return 0; } /** * kernfs_setattr - set iattr on a node * @kn: target node * @iattr: iattr to set * * Return: %0 on success, -errno on failure. */ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_iattr_rwsem); ret = __kernfs_setattr(kn, iattr); up_write(&root->kernfs_iattr_rwsem); return ret; } int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct kernfs_node *kn = inode->i_private; struct kernfs_root *root; int error; if (!kn) return -EINVAL; root = kernfs_root(kn); down_write(&root->kernfs_iattr_rwsem); error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) goto out; error = __kernfs_setattr(kn, iattr); if (error) goto out; /* this ignores size changes */ setattr_copy(&nop_mnt_idmap, inode, iattr); out: up_write(&root->kernfs_iattr_rwsem); return error; } ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; simple_inode_init_ts(inode); } static inline void set_inode_attr(struct inode *inode, struct kernfs_iattrs *attrs) { inode->i_uid = attrs->ia_uid; inode->i_gid = attrs->ia_gid; inode_set_atime_to_ts(inode, attrs->ia_atime); inode_set_mtime_to_ts(inode, attrs->ia_mtime); inode_set_ctime_to_ts(inode, attrs->ia_ctime); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) { struct kernfs_iattrs *attrs = kn->iattr; inode->i_mode = kn->mode; if (attrs) /* * kernfs_node has non-default attributes get them from * persistent copy in kernfs_node. */ set_inode_attr(inode, attrs); if (kernfs_type(kn) == KERNFS_DIR) set_nlink(inode, kn->dir.subdirs + 2); } int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); up_read(&root->kernfs_iattr_rwsem); return 0; } static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) { kernfs_get(kn); inode->i_private = kn; inode->i_mapping->a_ops = &ram_aops; inode->i_op = &kernfs_iops; inode->i_generation = kernfs_gen(kn); set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); /* initialize inode according to type */ switch (kernfs_type(kn)) { case KERNFS_DIR: inode->i_op = &kernfs_dir_iops; inode->i_fop = &kernfs_dir_fops; if (kn->flags & KERNFS_EMPTY_DIR) make_empty_dir_inode(inode); break; case KERNFS_FILE: inode->i_size = kn->attr.size; inode->i_fop = &kernfs_file_fops; break; case KERNFS_LINK: inode->i_op = &kernfs_symlink_iops; break; default: BUG(); } unlock_new_inode(inode); } /** * kernfs_get_inode - get inode for kernfs_node * @sb: super block * @kn: kernfs_node to allocate inode for * * Get inode for @kn. If such inode doesn't exist, a new inode is * allocated and basics are initialized. New inode is returned * locked. * * Locking: * Kernel thread context (may sleep). * * Return: * Pointer to allocated inode on success, %NULL on failure. */ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; inode = iget_locked(sb, kernfs_ino(kn)); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); return inode; } /* * The kernfs_node serves as both an inode and a directory entry for * kernfs. To prevent the kernfs inode numbers from being freed * prematurely we take a reference to kernfs_node from the kernfs inode. A * super_operations.evict_inode() implementation is needed to drop that * reference upon inode destruction. */ void kernfs_evict_inode(struct inode *inode) { struct kernfs_node *kn = inode->i_private; truncate_inode_pages_final(&inode->i_data); clear_inode(inode); kernfs_put(kn); } int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct kernfs_node *kn; struct kernfs_root *root; int ret; if (mask & MAY_NOT_BLOCK) return -ECHILD; kn = inode->i_private; root = kernfs_root(kn); down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); ret = generic_permission(&nop_mnt_idmap, inode, mask); up_read(&root->kernfs_iattr_rwsem); return ret; } int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn); if (!attrs) return -ENODATA; return simple_xattr_get(&attrs->xattrs, name, value, size); } int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr; struct kernfs_iattrs *attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags); if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); simple_xattr_free(old_xattr); return 0; } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *suffix, void *value, size_t size) { const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; return kernfs_xattr_get(kn, name, value, size); } static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; return kernfs_xattr_set(kn, name, value, size, flags); } static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, const char *full_name, struct simple_xattrs *xattrs, const void *value, size_t size, int flags) { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; struct simple_xattr *old_xattr; int ret; if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { ret = -ENOSPC; goto dec_count_out; } if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { ret = -ENOSPC; goto dec_size_out; } old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); if (!old_xattr) return 0; if (IS_ERR(old_xattr)) { ret = PTR_ERR(old_xattr); goto dec_size_out; } ret = 0; size = old_xattr->size; simple_xattr_free(old_xattr); dec_size_out: atomic_sub(size, sz); dec_count_out: atomic_dec(nr); return ret; } static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, const char *full_name, struct simple_xattrs *xattrs, const void *value, size_t size, int flags) { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; struct simple_xattr *old_xattr; old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); if (!old_xattr) return 0; if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); atomic_sub(old_xattr->size, sz); atomic_dec(nr); simple_xattr_free(old_xattr); return 0; } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *full_name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; struct kernfs_iattrs *attrs; if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR)) return -EOPNOTSUPP; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; if (value) return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs, value, size, flags); else return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs, value, size, flags); } static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_xattr_set, }; static const struct xattr_handler kernfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_xattr_set, }; static const struct xattr_handler kernfs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_user_xattr_set, }; const struct xattr_handler * const kernfs_xattr_handlers[] = { &kernfs_trusted_xattr_handler, &kernfs_security_xattr_handler, &kernfs_user_xattr_handler, NULL };
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for Logitech Flight System G940 * * Copyright (c) 2009 Gary Stein <LordCnidarian@gmail.com> */ /* */ #include <linux/input.h> #include <linux/hid.h> #include "hid-lg.h" /* * G940 Theory of Operation (from experimentation) * * There are 63 fields (only 3 of them currently used) * 0 - seems to be command field * 1 - 30 deal with the x axis * 31 -60 deal with the y axis * * Field 1 is x axis constant force * Field 31 is y axis constant force * * other interesting fields 1,2,3,4 on x axis * (same for 31,32,33,34 on y axis) * * 0 0 127 127 makes the joystick autocenter hard * * 127 0 127 127 makes the joystick loose on the right, * but stops all movemnt left * * -127 0 -127 -127 makes the joystick loose on the left, * but stops all movement right * * 0 0 -127 -127 makes the joystick rattle very hard * * I'm sure these are effects that I don't know enough about them */ struct lg3ff_device { struct hid_report *report; }; static int hid_lg3ff_play(struct input_dev *dev, void *data, struct ff_effect *effect) { struct hid_device *hid = input_get_drvdata(dev); struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list; struct hid_report *report = list_entry(report_list->next, struct hid_report, list); int x, y; /* * Available values in the field should always be 63, but we only use up to * 35. Instead, clear the entire area, however big it is. */ memset(report->field[0]->value, 0, sizeof(__s32) * report->field[0]->report_count); switch (effect->type) { case FF_CONSTANT: /* * Already clamped in ff_memless * 0 is center (different then other logitech) */ x = effect->u.ramp.start_level; y = effect->u.ramp.end_level; /* send command byte */ report->field[0]->value[0] = 0x51; /* * Sign backwards from other Force3d pro * which get recast here in two's complement 8 bits */ report->field[0]->value[1] = (unsigned char)(-x); report->field[0]->value[31] = (unsigned char)(-y); hid_hw_request(hid, report, HID_REQ_SET_REPORT); break; } return 0; } static void hid_lg3ff_set_autocenter(struct input_dev *dev, u16 magnitude) { struct hid_device *hid = input_get_drvdata(dev); struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list; struct hid_report *report = list_entry(report_list->next, struct hid_report, list); /* * Auto Centering probed from device * NOTE: deadman's switch on G940 must be covered * for effects to work */ report->field[0]->value[0] = 0x51; report->field[0]->value[1] = 0x00; report->field[0]->value[2] = 0x00; report->field[0]->value[3] = 0x7F; report->field[0]->value[4] = 0x7F; report->field[0]->value[31] = 0x00; report->field[0]->value[32] = 0x00; report->field[0]->value[33] = 0x7F; report->field[0]->value[34] = 0x7F; hid_hw_request(hid, report, HID_REQ_SET_REPORT); } static const signed short ff3_joystick_ac[] = { FF_CONSTANT, FF_AUTOCENTER, -1 }; int lg3ff_init(struct hid_device *hid) { struct hid_input *hidinput; struct input_dev *dev; const signed short *ff_bits = ff3_joystick_ac; int error; int i; if (list_empty(&hid->inputs)) { hid_err(hid, "no inputs found\n"); return -ENODEV; } hidinput = list_entry(hid->inputs.next, struct hid_input, list); dev = hidinput->input; /* Check that the report looks ok */ if (!hid_validate_values(hid, HID_OUTPUT_REPORT, 0, 0, 35)) return -ENODEV; /* Assume single fixed device G940 */ for (i = 0; ff_bits[i] >= 0; i++) set_bit(ff_bits[i], dev->ffbit); error = input_ff_create_memless(dev, NULL, hid_lg3ff_play); if (error) return error; if (test_bit(FF_AUTOCENTER, dev->ffbit)) dev->ff->set_autocenter = hid_lg3ff_set_autocenter; hid_info(hid, "Force feedback for Logitech Flight System G940 by Gary Stein <LordCnidarian@gmail.com>\n"); return 0; }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 // SPDX-License-Identifier: GPL-2.0+ /* * KLSI KL5KUSB105 chip RS232 converter driver * * Copyright (C) 2010 Johan Hovold <jhovold@gmail.com> * Copyright (C) 2001 Utz-Uwe Haus <haus@uuhaus.de> * * All information about the device was acquired using SniffUSB ans snoopUSB * on Windows98. * It was written out of frustration with the PalmConnect USB Serial adapter * sold by Palm Inc. * Neither Palm, nor their contractor (MCCI) or their supplier (KLSI) provided * information that was not already available. * * It seems that KLSI bought some silicon-design information from ScanLogic, * whose SL11R processor is at the core of the KL5KUSB chipset from KLSI. * KLSI has firmware available for their devices; it is probable that the * firmware differs from that used by KLSI in their products. If you have an * original KLSI device and can provide some information on it, I would be * most interested in adding support for it here. If you have any information * on the protocol used (or find errors in my reverse-engineered stuff), please * let me know. * * The code was only tested with a PalmConnect USB adapter; if you * are adventurous, try it with any KLSI-based device and let me know how it * breaks so that I can fix it! */ /* TODO: * check modem line signals * implement handshaking or decide that we do not support it */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/uaccess.h> #include <asm/unaligned.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include "kl5kusb105.h" #define DRIVER_AUTHOR "Utz-Uwe Haus <haus@uuhaus.de>, Johan Hovold <jhovold@gmail.com>" #define DRIVER_DESC "KLSI KL5KUSB105 chipset USB->Serial Converter driver" /* * Function prototypes */ static int klsi_105_port_probe(struct usb_serial_port *port); static void klsi_105_port_remove(struct usb_serial_port *port); static int klsi_105_open(struct tty_struct *tty, struct usb_serial_port *port); static void klsi_105_close(struct usb_serial_port *port); static void klsi_105_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios); static int klsi_105_tiocmget(struct tty_struct *tty); static void klsi_105_process_read_urb(struct urb *urb); static int klsi_105_prepare_write_buffer(struct usb_serial_port *port, void *dest, size_t size); /* * All of the device info needed for the KLSI converters. */ static const struct usb_device_id id_table[] = { { USB_DEVICE(PALMCONNECT_VID, PALMCONNECT_PID) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table); static struct usb_serial_driver kl5kusb105d_device = { .driver = { .owner = THIS_MODULE, .name = "kl5kusb105d", }, .description = "KL5KUSB105D / PalmConnect", .id_table = id_table, .num_ports = 1, .bulk_out_size = 64, .open = klsi_105_open, .close = klsi_105_close, .set_termios = klsi_105_set_termios, .tiocmget = klsi_105_tiocmget, .port_probe = klsi_105_port_probe, .port_remove = klsi_105_port_remove, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, .process_read_urb = klsi_105_process_read_urb, .prepare_write_buffer = klsi_105_prepare_write_buffer, }; static struct usb_serial_driver * const serial_drivers[] = { &kl5kusb105d_device, NULL }; struct klsi_105_port_settings { u8 pktlen; /* always 5, it seems */ u8 baudrate; u8 databits; u8 unknown1; u8 unknown2; }; struct klsi_105_private { struct klsi_105_port_settings cfg; unsigned long line_state; /* modem line settings */ spinlock_t lock; }; /* * Handle vendor specific USB requests */ #define KLSI_TIMEOUT 5000 /* default urb timeout */ static int klsi_105_chg_port_settings(struct usb_serial_port *port, struct klsi_105_port_settings *settings) { int rc; rc = usb_control_msg_send(port->serial->dev, 0, KL5KUSB105A_SIO_SET_DATA, USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_INTERFACE, 0, /* value */ 0, /* index */ settings, sizeof(struct klsi_105_port_settings), KLSI_TIMEOUT, GFP_KERNEL); if (rc) dev_err(&port->dev, "Change port settings failed (error = %d)\n", rc); dev_dbg(&port->dev, "pktlen %u, baudrate 0x%02x, databits %u, u1 %u, u2 %u\n", settings->pktlen, settings->baudrate, settings->databits, settings->unknown1, settings->unknown2); return rc; } /* * Read line control via vendor command and return result through * the state pointer. */ static int klsi_105_get_line_state(struct usb_serial_port *port, unsigned long *state) { u16 status; int rc; rc = usb_control_msg_recv(port->serial->dev, 0, KL5KUSB105A_SIO_POLL, USB_TYPE_VENDOR | USB_DIR_IN, 0, /* value */ 0, /* index */ &status, sizeof(status), 10000, GFP_KERNEL); if (rc) { dev_err(&port->dev, "reading line status failed: %d\n", rc); return rc; } le16_to_cpus(&status); dev_dbg(&port->dev, "read status %04x\n", status); *state = ((status & KL5KUSB105A_DSR) ? TIOCM_DSR : 0) | ((status & KL5KUSB105A_CTS) ? TIOCM_CTS : 0); return 0; } /* * Driver's tty interface functions */ static int klsi_105_port_probe(struct usb_serial_port *port) { struct klsi_105_private *priv; priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; /* set initial values for control structures */ priv->cfg.pktlen = 5; priv->cfg.baudrate = kl5kusb105a_sio_b9600; priv->cfg.databits = kl5kusb105a_dtb_8; priv->cfg.unknown1 = 0; priv->cfg.unknown2 = 1; priv->line_state = 0; spin_lock_init(&priv->lock); usb_set_serial_port_data(port, priv); return 0; } static void klsi_105_port_remove(struct usb_serial_port *port) { struct klsi_105_private *priv; priv = usb_get_serial_port_data(port); kfree(priv); } static int klsi_105_open(struct tty_struct *tty, struct usb_serial_port *port) { struct klsi_105_private *priv = usb_get_serial_port_data(port); int retval = 0; int rc; unsigned long line_state; struct klsi_105_port_settings cfg; unsigned long flags; /* Do a defined restart: * Set up sane default baud rate and send the 'READ_ON' * vendor command. * FIXME: set modem line control (how?) * Then read the modem line control and store values in * priv->line_state. */ cfg.pktlen = 5; cfg.baudrate = kl5kusb105a_sio_b9600; cfg.databits = kl5kusb105a_dtb_8; cfg.unknown1 = 0; cfg.unknown2 = 1; klsi_105_chg_port_settings(port, &cfg); spin_lock_irqsave(&priv->lock, flags); priv->cfg.pktlen = cfg.pktlen; priv->cfg.baudrate = cfg.baudrate; priv->cfg.databits = cfg.databits; priv->cfg.unknown1 = cfg.unknown1; priv->cfg.unknown2 = cfg.unknown2; spin_unlock_irqrestore(&priv->lock, flags); /* READ_ON and urb submission */ rc = usb_serial_generic_open(tty, port); if (rc) return rc; rc = usb_control_msg(port->serial->dev, usb_sndctrlpipe(port->serial->dev, 0), KL5KUSB105A_SIO_CONFIGURE, USB_TYPE_VENDOR|USB_DIR_OUT|USB_RECIP_INTERFACE, KL5KUSB105A_SIO_CONFIGURE_READ_ON, 0, /* index */ NULL, 0, KLSI_TIMEOUT); if (rc < 0) { dev_err(&port->dev, "Enabling read failed (error = %d)\n", rc); retval = rc; goto err_generic_close; } else dev_dbg(&port->dev, "%s - enabled reading\n", __func__); rc = klsi_105_get_line_state(port, &line_state); if (rc < 0) { retval = rc; goto err_disable_read; } spin_lock_irqsave(&priv->lock, flags); priv->line_state = line_state; spin_unlock_irqrestore(&priv->lock, flags); dev_dbg(&port->dev, "%s - read line state 0x%lx\n", __func__, line_state); return 0; err_disable_read: usb_control_msg(port->serial->dev, usb_sndctrlpipe(port->serial->dev, 0), KL5KUSB105A_SIO_CONFIGURE, USB_TYPE_VENDOR | USB_DIR_OUT, KL5KUSB105A_SIO_CONFIGURE_READ_OFF, 0, /* index */ NULL, 0, KLSI_TIMEOUT); err_generic_close: usb_serial_generic_close(port); return retval; } static void klsi_105_close(struct usb_serial_port *port) { int rc; /* send READ_OFF */ rc = usb_control_msg(port->serial->dev, usb_sndctrlpipe(port->serial->dev, 0), KL5KUSB105A_SIO_CONFIGURE, USB_TYPE_VENDOR | USB_DIR_OUT, KL5KUSB105A_SIO_CONFIGURE_READ_OFF, 0, /* index */ NULL, 0, KLSI_TIMEOUT); if (rc < 0) dev_err(&port->dev, "failed to disable read: %d\n", rc); /* shutdown our bulk reads and writes */ usb_serial_generic_close(port); } /* We need to write a complete 64-byte data block and encode the * number actually sent in the first double-byte, LSB-order. That * leaves at most 62 bytes of payload. */ #define KLSI_HDR_LEN 2 static int klsi_105_prepare_write_buffer(struct usb_serial_port *port, void *dest, size_t size) { unsigned char *buf = dest; int count; count = kfifo_out_locked(&port->write_fifo, buf + KLSI_HDR_LEN, size, &port->lock); put_unaligned_le16(count, buf); return count + KLSI_HDR_LEN; } /* The data received is preceded by a length double-byte in LSB-first order. */ static void klsi_105_process_read_urb(struct urb *urb) { struct usb_serial_port *port = urb->context; unsigned char *data = urb->transfer_buffer; unsigned len; /* empty urbs seem to happen, we ignore them */ if (!urb->actual_length) return; if (urb->actual_length <= KLSI_HDR_LEN) { dev_dbg(&port->dev, "%s - malformed packet\n", __func__); return; } len = get_unaligned_le16(data); if (len > urb->actual_length - KLSI_HDR_LEN) { dev_dbg(&port->dev, "%s - packet length mismatch\n", __func__); len = urb->actual_length - KLSI_HDR_LEN; } tty_insert_flip_string(&port->port, data + KLSI_HDR_LEN, len); tty_flip_buffer_push(&port->port); } static void klsi_105_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct klsi_105_private *priv = usb_get_serial_port_data(port); struct device *dev = &port->dev; unsigned int iflag = tty->termios.c_iflag; unsigned int old_iflag = old_termios->c_iflag; unsigned int cflag = tty->termios.c_cflag; unsigned int old_cflag = old_termios->c_cflag; struct klsi_105_port_settings *cfg; unsigned long flags; speed_t baud; cfg = kmalloc(sizeof(*cfg), GFP_KERNEL); if (!cfg) return; /* lock while we are modifying the settings */ spin_lock_irqsave(&priv->lock, flags); /* * Update baud rate */ baud = tty_get_baud_rate(tty); switch (baud) { case 0: /* handled below */ break; case 1200: priv->cfg.baudrate = kl5kusb105a_sio_b1200; break; case 2400: priv->cfg.baudrate = kl5kusb105a_sio_b2400; break; case 4800: priv->cfg.baudrate = kl5kusb105a_sio_b4800; break; case 9600: priv->cfg.baudrate = kl5kusb105a_sio_b9600; break; case 19200: priv->cfg.baudrate = kl5kusb105a_sio_b19200; break; case 38400: priv->cfg.baudrate = kl5kusb105a_sio_b38400; break; case 57600: priv->cfg.baudrate = kl5kusb105a_sio_b57600; break; case 115200: priv->cfg.baudrate = kl5kusb105a_sio_b115200; break; default: dev_dbg(dev, "unsupported baudrate, using 9600\n"); priv->cfg.baudrate = kl5kusb105a_sio_b9600; baud = 9600; break; } /* * FIXME: implement B0 handling * * Maybe this should be simulated by sending read disable and read * enable messages? */ tty_encode_baud_rate(tty, baud, baud); if ((cflag & CSIZE) != (old_cflag & CSIZE)) { /* set the number of data bits */ switch (cflag & CSIZE) { case CS5: dev_dbg(dev, "%s - 5 bits/byte not supported\n", __func__); spin_unlock_irqrestore(&priv->lock, flags); goto err; case CS6: dev_dbg(dev, "%s - 6 bits/byte not supported\n", __func__); spin_unlock_irqrestore(&priv->lock, flags); goto err; case CS7: priv->cfg.databits = kl5kusb105a_dtb_7; break; case CS8: priv->cfg.databits = kl5kusb105a_dtb_8; break; default: dev_err(dev, "CSIZE was not CS5-CS8, using default of 8\n"); priv->cfg.databits = kl5kusb105a_dtb_8; break; } } /* * Update line control register (LCR) */ if ((cflag & (PARENB|PARODD)) != (old_cflag & (PARENB|PARODD)) || (cflag & CSTOPB) != (old_cflag & CSTOPB)) { /* Not currently supported */ tty->termios.c_cflag &= ~(PARENB|PARODD|CSTOPB); } /* * Set flow control: well, I do not really now how to handle DTR/RTS. * Just do what we have seen with SniffUSB on Win98. */ if ((iflag & IXOFF) != (old_iflag & IXOFF) || (iflag & IXON) != (old_iflag & IXON) || (cflag & CRTSCTS) != (old_cflag & CRTSCTS)) { /* Not currently supported */ tty->termios.c_cflag &= ~CRTSCTS; } memcpy(cfg, &priv->cfg, sizeof(*cfg)); spin_unlock_irqrestore(&priv->lock, flags); /* now commit changes to device */ klsi_105_chg_port_settings(port, cfg); err: kfree(cfg); } static int klsi_105_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct klsi_105_private *priv = usb_get_serial_port_data(port); unsigned long flags; int rc; unsigned long line_state; rc = klsi_105_get_line_state(port, &line_state); if (rc < 0) { dev_err(&port->dev, "Reading line control failed (error = %d)\n", rc); /* better return value? EAGAIN? */ return rc; } spin_lock_irqsave(&priv->lock, flags); priv->line_state = line_state; spin_unlock_irqrestore(&priv->lock, flags); dev_dbg(&port->dev, "%s - read line state 0x%lx\n", __func__, line_state); return (int)line_state; } module_usb_serial_driver(serial_drivers, id_table); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
110 110 162 24 24 24 24 24 24 24 24 19 24 110 110 143 141 142 143 1189 30 1189 1188 1189 445 445 975 1189 1189 135 27 162 1189 1189 1189 162 162 162 162 162 162 162 162 162 10 162 172 171 172 172 172 172 1189 110 110 110 110 110 110 110 110 110 110 110 110 1190 161 234 1187 1185 1186 13 1189 1187 825 825 2 2 10 10 10 290 291 285 291 1095 19 1095 5 1 1088 1087 1089 4 115 110 110 110 115 110 110 24 1 1 113 110 110 110 1182 128 1 161 160 16 110 110 118 1188 1181 21 1189 30 1189 1189 1159 1167 1167 913 913 110 913 132 4 4 27 25 27 26 7 7 7 1 1 1 5 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic socket support routines. Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Fixes: * Alan Cox : Numerous verify_area() problems * Alan Cox : Connecting on a connecting socket * now returns an error for tcp. * Alan Cox : sock->protocol is set correctly. * and is not sometimes left as 0. * Alan Cox : connect handles icmp errors on a * connect properly. Unfortunately there * is a restart syscall nasty there. I * can't match BSD without hacking the C * library. Ideas urgently sought! * Alan Cox : Disallow bind() to addresses that are * not ours - especially broadcast ones!! * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, * instead they leave that for the DESTROY timer. * Alan Cox : Clean up error flag in accept * Alan Cox : TCP ack handling is buggy, the DESTROY timer * was buggy. Put a remove_sock() in the handler * for memory when we hit 0. Also altered the timer * code. The ACK stuff can wait and needs major * TCP layer surgery. * Alan Cox : Fixed TCP ack bug, removed remove sock * and fixed timer/inet_bh race. * Alan Cox : Added zapped flag for TCP * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... * Rick Sladkey : Relaxed UDP rules for matching packets. * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support * Pauline Middelink : identd support * Alan Cox : Fixed connect() taking signals I think. * Alan Cox : SO_LINGER supported * Alan Cox : Error reporting fixes * Anonymous : inet_create tidied up (sk->reuse setting) * Alan Cox : inet sockets don't set sk->type! * Alan Cox : Split socket option code * Alan Cox : Callbacks * Alan Cox : Nagle flag for Charles & Johannes stuff * Alex : Removed restriction on inet fioctl * Alan Cox : Splitting INET from NET core * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code * Alan Cox : Split IP from generic code * Alan Cox : New kfree_skbmem() * Alan Cox : Make SO_DEBUG superuser only. * Alan Cox : Allow anyone to clear SO_DEBUG * (compatibility fix) * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. * Alan Cox : Allocator for a socket is settable. * Alan Cox : SO_ERROR includes soft errors. * Alan Cox : Allow NULL arguments on some SO_ opts * Alan Cox : Generic socket allocation to make hooks * easier (suggested by Craig Metz). * Michael Pall : SO_ERROR returns positive errno again * Steve Whitehouse: Added default destructor to free * protocol private data. * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * Andi Kleen : Fix write_space callback * Chris Evans : Security fixes - signedness again * Arnaldo C. Melo : cleanups, use skb_queue_purge * * To Fix: */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> #include <linux/static_key.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/compat.h> #include <linux/mroute.h> #include <linux/mroute6.h> #include <linux/icmpv6.h> #include <linux/uaccess.h> #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> #include <linux/sock_diag.h> #include <linux/filter.h> #include <net/sock_reuseport.h> #include <net/bpf_sk_storage.h> #include <trace/events/sock.h> #include <net/tcp.h> #include <net/busy_poll.h> #include <net/phonet/phonet.h> #include <linux/ethtool.h> #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); static void sock_def_write_space_wfree(struct sock *sk); static void sock_def_write_space(struct sock *sk); /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in the user * namespace @user_ns. */ bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap) { return file_ns_capable(sk->sk_socket->file, user_ns, cap) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(sk_ns_capable); /** * sk_capable - Socket global capability test * @sk: Socket to use a capability on or through * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user * namespaces. */ bool sk_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, &init_user_ns, cap); } EXPORT_SYMBOL(sk_capable); /** * sk_net_capable - Network namespace socket capability test * @sk: Socket to use a capability on or through * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of. */ bool sk_net_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); } EXPORT_SYMBOL(sk_net_capable); /* * Each address family might have different locking rules, so we have * one slock key per address family and separate keys for internal and * userspace sockets. */ static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_kern_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */ #define _sock_locks(x) \ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ x "27" , x "28" , x "AF_CAN" , \ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ x "AF_MCTP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { _sock_locks("slock-") }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { _sock_locks("clock-") }; static const char *const af_family_kern_key_strings[AF_MAX+1] = { _sock_locks("k-sk_lock-") }; static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { _sock_locks("k-slock-") }; static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { _sock_locks("k-clock-") }; static const char *const af_family_rlock_key_strings[AF_MAX+1] = { _sock_locks("rlock-") }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { _sock_locks("wlock-") }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { _sock_locks("elock-") }; /* * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; static struct lock_class_key af_rlock_keys[AF_MAX]; static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; EXPORT_SYMBOL(sysctl_wmem_max); __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; /* Maximal space eaten by iovec or ancillary data plus some space */ int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); int sysctl_tstamp_allow_data __read_mostly = 1; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); /** * sk_set_memalloc - sets %SOCK_MEMALLOC * @sk: socket to set it on * * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. * It's the responsibility of the admin to adjust min_free_kbytes * to meet the requirements */ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; static_branch_inc(&memalloc_socks_key); } EXPORT_SYMBOL_GPL(sk_set_memalloc); void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; static_branch_dec(&memalloc_socks_key); /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward * progress of swapping. SOCK_MEMALLOC may be cleared while * it has rmem allocations due to the last swapfile being deactivated * but there is a risk that the socket is unusable due to exceeding * the rmem limits. Reclaim the reserves and obey rmem limits again. */ sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); noreclaim_flag = memalloc_noreclaim_save(); ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); memalloc_noreclaim_restore(noreclaim_flag); return ret; } EXPORT_SYMBOL(__sk_backlog_rcv); void sk_error_report(struct sock *sk) { sk->sk_error_report(sk); switch (sk->sk_family) { case AF_INET: fallthrough; case AF_INET6: trace_inet_sk_error_report(sk); break; default: break; } } EXPORT_SYMBOL(sk_error_report); int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; tv.tv_usec = 0; } else { tv.tv_sec = timeo / HZ; tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; } if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; *(struct old_timeval32 *)optval = tv32; return sizeof(tv32); } if (old_timeval) { struct __kernel_old_timeval old_tv; old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; return sizeof(old_tv); } *(struct __kernel_sock_timeval *)optval = tv; return sizeof(tv); } EXPORT_SYMBOL(sock_get_timeout); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval) { if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; if (optlen < sizeof(tv32)) return -EINVAL; if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; tv->tv_sec = tv32.tv_sec; tv->tv_usec = tv32.tv_usec; } else if (old_timeval) { struct __kernel_old_timeval old_tv; if (optlen < sizeof(old_tv)) return -EINVAL; if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; tv->tv_sec = old_tv.tv_sec; tv->tv_usec = old_tv.tv_usec; } else { if (optlen < sizeof(*tv)) return -EINVAL; if (copy_from_sockptr(tv, optval, sizeof(*tv))) return -EFAULT; } return 0; } EXPORT_SYMBOL(sock_copy_user_timeval); static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, bool old_timeval) { struct __kernel_sock_timeval tv; int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); long val; if (err) return err; if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; if (tv.tv_sec < 0) { static int warned __read_mostly; WRITE_ONCE(*timeo_p, 0); if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", __func__, current->comm, task_pid_nr(current)); } return 0; } val = MAX_SCHEDULE_TIMEOUT; if ((tv.tv_sec || tv.tv_usec) && (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); WRITE_ONCE(*timeo_p, val); return 0; } static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { case AF_UNSPEC: case AF_UNIX: return false; default: return true; } } static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; if (sock_needs_netstamp(sk) && !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; } skb->dev = NULL; skb_set_owner_r(skb, sk); /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ skb_dst_force(skb); spin_lock_irqsave(&list->lock, flags); sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(__sock_queue_rcv_skb); int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { enum skb_drop_reason drop_reason; int err; err = sk_filter(sk, skb); if (err) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto out; } err = __sock_queue_rcv_skb(sk, skb); switch (err) { case -ENOMEM: drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; break; case -ENOBUFS: drop_reason = SKB_DROP_REASON_PROTO_MEM; break; default: drop_reason = SKB_NOT_DROPPED_YET; break; } out: if (reason) *reason = drop_reason; return err; } EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) { int rc = NET_RX_SUCCESS; if (sk_filter_trim_cap(sk, skb, trim_cap)) goto discard_and_relse; skb->dev = NULL; if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } if (nested) bh_lock_sock_nested(sk); else bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* * trylock + unlock semantics: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; } bh_unlock_sock(sk); out: if (refcounted) sock_put(sk); return rc; discard_and_relse: kfree_skb(skb); goto out; } EXPORT_SYMBOL(__sk_receive_skb); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(__sk_dst_check); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(sk_dst_check); static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); /* Sorry... */ ret = -EPERM; if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; if (ifindex < 0) goto out; /* Paired with all READ_ONCE() done locklessly. */ WRITE_ONCE(sk->sk_bound_dev_if, ifindex); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); sk_dst_reset(sk); ret = 0; out: #endif return ret; } int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; if (lock_sk) lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); if (lock_sk) release_sock(sk); return ret; } EXPORT_SYMBOL(sock_bindtoindex); static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); char devname[IFNAMSIZ]; int index; ret = -EINVAL; if (optlen < 0) goto out; /* Bind this socket to a particular device like "eth0", * as specified in the passed interface name. If the * name is "" or the option length is zero the socket * is not bound. */ if (optlen > IFNAMSIZ - 1) optlen = IFNAMSIZ - 1; memset(devname, 0, sizeof(devname)); ret = -EFAULT; if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; if (devname[0] != '\0') { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, devname); if (dev) index = dev->ifindex; rcu_read_unlock(); ret = -ENODEV; if (!dev) goto out; } sockopt_lock_sock(sk); ret = sock_bindtoindex_locked(sk, index); sockopt_release_sock(sk); out: #endif return ret; } static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); struct net *net = sock_net(sk); char devname[IFNAMSIZ]; if (bound_dev_if == 0) { len = 0; goto zero; } ret = -EINVAL; if (len < IFNAMSIZ) goto out; ret = netdev_get_name(net, devname, bound_dev_if); if (ret) goto out; len = strlen(devname) + 1; ret = -EFAULT; if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; out: #endif return ret; } bool sk_mc_loop(const struct sock *sk) { if (dev_recursion_level()) return false; if (!sk) return true; /* IPV6_ADDRFORM can change sk->sk_family under us. */ switch (READ_ONCE(sk->sk_family)) { case AF_INET: return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return inet6_test_bit(MC6_LOOP, sk); #endif } WARN_ON_ONCE(1); return true; } EXPORT_SYMBOL(sk_mc_loop); void sock_set_reuseaddr(struct sock *sk) { lock_sock(sk); sk->sk_reuse = SK_CAN_REUSE; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseaddr); void sock_set_reuseport(struct sock *sk) { lock_sock(sk); sk->sk_reuseport = true; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { WRITE_ONCE(sk->sk_priority, priority); } EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); sock_set_flag(sk, SOCK_RCVTSTAMP); sock_enable_timestamp(sk, SOCK_TIMESTAMP); } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); } } void sock_enable_timestamps(struct sock *sk) { lock_sock(sk); __sock_set_timestamps(sk, true, false, true); release_sock(sk); } EXPORT_SYMBOL(sock_enable_timestamps); void sock_set_timestamp(struct sock *sk, int optname, bool valbool) { switch (optname) { case SO_TIMESTAMP_OLD: __sock_set_timestamps(sk, valbool, false, false); break; case SO_TIMESTAMP_NEW: __sock_set_timestamps(sk, valbool, true, false); break; case SO_TIMESTAMPNS_OLD: __sock_set_timestamps(sk, valbool, false, true); break; case SO_TIMESTAMPNS_NEW: __sock_set_timestamps(sk, valbool, true, true); break; } } static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) { struct net *net = sock_net(sk); struct net_device *dev = NULL; bool match = false; int *vclock_index; int i, num; if (sk->sk_bound_dev_if) dev = dev_get_by_index(net, sk->sk_bound_dev_if); if (!dev) { pr_err("%s: sock not bind to device\n", __func__); return -EOPNOTSUPP; } num = ethtool_get_phc_vclocks(dev, &vclock_index); dev_put(dev); for (i = 0; i < num; i++) { if (*(vclock_index + i) == phc_index) { match = true; break; } } if (num > 0) kfree(vclock_index); if (!match) return -EINVAL; WRITE_ONCE(sk->sk_bind_phc, phc_index); return 0; } int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping) { int val = timestamping.flags; int ret; if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP && !(val & SOF_TIMESTAMPING_OPT_ID)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP) atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); else atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { atomic_set(&sk->sk_tskey, 0); } } if (val & SOF_TIMESTAMPING_OPT_STATS && !(val & SOF_TIMESTAMPING_OPT_TSONLY)) return -EINVAL; if (val & SOF_TIMESTAMPING_BIND_PHC) { ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); if (ret) return ret; } WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); return 0; } void sock_set_keepalive(struct sock *sk) { lock_sock(sk); if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, true); sock_valbool_flag(sk, SOCK_KEEPOPEN, true); release_sock(sk); } EXPORT_SYMBOL(sock_set_keepalive); static void __sock_set_rcvbuf(struct sock *sk, int val) { /* Ensure val * 2 fits into an int, to prevent max_t() from treating it * as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* We double it on the way in to account for "struct sk_buff" etc. * overhead. Applications assume that the SO_RCVBUF setting they make * will allow that much actual data to be received on that socket. * * Applications are unaware that "struct sk_buff" and other overheads * allocate from the receive buffer during socket buffer allocation. * * And after considering the possible alternatives, returning the value * we actually used in getsockopt is the most desirable behavior. */ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); } void sock_set_rcvbuf(struct sock *sk, int val) { lock_sock(sk); __sock_set_rcvbuf(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { WRITE_ONCE(sk->sk_mark, val); sk_dst_reset(sk); } } void sock_set_mark(struct sock *sk, u32 val) { lock_sock(sk); __sock_set_mark(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_mark); static void sock_release_reserved_memory(struct sock *sk, int bytes) { /* Round down bytes to multiple of pages */ bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); sk_mem_reclaim(sk); } static int sock_reserve_memory(struct sock *sk, int bytes) { long allocated; bool charged; int pages; if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) return 0; pages = sk_mem_pages(bytes); /* pre-charge to memcg */ charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); /* If the system goes into memory pressure with this * precharge, give up and return error. */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM; } sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem + (pages << PAGE_SHIFT)); return 0; } void sockopt_lock_sock(struct sock *sk) { /* When current->bpf_ctx is set, the setsockopt is called from * a bpf prog. bpf has ensured the sk lock has been * acquired before calling setsockopt(). */ if (has_current_bpf_ctx()) return; lock_sock(sk); } EXPORT_SYMBOL(sockopt_lock_sock); void sockopt_release_sock(struct sock *sk) { if (has_current_bpf_ctx()) return; release_sock(sk); } EXPORT_SYMBOL(sockopt_release_sock); bool sockopt_ns_capable(struct user_namespace *ns, int cap) { return has_current_bpf_ctx() || ns_capable(ns, cap); } EXPORT_SYMBOL(sockopt_ns_capable); bool sockopt_capable(int cap) { return has_current_bpf_ctx() || capable(cap); } EXPORT_SYMBOL(sockopt_capable); /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; int val; int valbool; struct linger ling; int ret = 0; /* * Options without arguments */ if (optname == SO_BINDTODEVICE) return sock_setbindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; /* handle options which do not require locking the socket. */ switch (optname) { case SO_PRIORITY: if ((val >= 0 && val <= 6) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { sock_set_priority(sk, val); return 0; } return -EPERM; case SO_PASSSEC: assign_bit(SOCK_PASSSEC, &sock->flags, valbool); return 0; case SO_PASSCRED: assign_bit(SOCK_PASSCRED, &sock->flags, valbool); return 0; case SO_PASSPIDFD: assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); return 0; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: case SO_ERROR: return -ENOPROTOOPT; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: if (val < 0) return -EINVAL; WRITE_ONCE(sk->sk_ll_usec, val); return 0; case SO_PREFER_BUSY_POLL: if (valbool && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM; WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); return 0; case SO_BUSY_POLL_BUDGET: if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM; if (val < 0 || val > U16_MAX) return -EINVAL; WRITE_ONCE(sk->sk_busy_poll_budget, val); return 0; #endif case SO_MAX_PACING_RATE: { unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; unsigned long pacing_rate; if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && copy_from_sockptr(&ulval, optval, sizeof(ulval))) { return -EFAULT; } if (ulval != ~0UL) cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); /* Pairs with READ_ONCE() from sk_getsockopt() */ WRITE_ONCE(sk->sk_max_pacing_rate, ulval); pacing_rate = READ_ONCE(sk->sk_pacing_rate); if (ulval < pacing_rate) WRITE_ONCE(sk->sk_pacing_rate, ulval); return 0; } case SO_TXREHASH: if (val < -1 || val > 1) return -EINVAL; if ((u8)val == SOCK_TXREHASH_DEFAULT) val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() * and sk_getsockopt(). */ WRITE_ONCE(sk->sk_txrehash, (u8)val); return 0; } sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); break; case SO_SNDBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; case SO_SNDBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ if (val < 0) val = 0; goto set_sndbuf; case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ __sock_set_rcvbuf(sk, max(val, 0)); break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; case SO_OOBINLINE: sock_valbool_flag(sk, SOCK_URGINLINE, valbool); break; case SO_NO_CHECK: sk->sk_no_check_tx = valbool; break; case SO_LINGER: if (optlen < sizeof(ling)) { ret = -EINVAL; /* 1003.1g */ break; } if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); } else { unsigned long t_sec = ling.l_linger; if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: case SO_TIMESTAMPNS_NEW: sock_set_timestamp(sk, optname, valbool); break; case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: if (optlen == sizeof(timestamping)) { if (copy_from_sockptr(&timestamping, optval, sizeof(timestamping))) { ret = -EFAULT; break; } } else { memset(&timestamping, 0, sizeof(timestamping)); timestamping.flags = val; } ret = sock_set_timestamping(sk, optname, timestamping); break; case SO_RCVLOWAT: { int (*set_rcvlowat)(struct sock *sk, int val) = NULL; if (val < 0) val = INT_MAX; if (sock) set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; if (set_rcvlowat) ret = set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; } case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); break; case SO_ATTACH_FILTER: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_attach_filter(&fprog, sk); break; } case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; case SO_ATTACH_REUSEPORT_CBPF: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); break; } case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); } break; case SO_DETACH_REUSEPORT_BPF: ret = reuseport_detach_prog(sk); break; case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; case SO_LOCK_FILTER: if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) ret = -EPERM; else sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } __sock_set_mark(sk, val); break; case SO_RCVMARK: sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; case SO_WIFI_STATUS: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; case SO_PEEK_OFF: { int (*set_peek_off)(struct sock *sk, int val); set_peek_off = READ_ONCE(sock->ops)->set_peek_off; if (set_peek_off) ret = set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; } case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; case SO_SELECT_ERR_QUEUE: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; case SO_CNX_ADVICE: if (val == 1) dst_negative_advice(sk); break; case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (!(sk_is_tcp(sk) || (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP))) ret = -EOPNOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -EOPNOTSUPP; } if (!ret) { if (val < 0 || val > 1) ret = -EINVAL; else sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); } break; case SO_TXTIME: if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; break; } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; break; } /* CLOCK_MONOTONIC is only used by sch_fq, and this packet * scheduler has enough safe guards. */ if (sk_txtime.clockid != CLOCK_MONOTONIC && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } sock_valbool_flag(sk, SOCK_TXTIME, true); sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); sk->sk_txtime_report_errors = !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: ret = sock_bindtoindex_locked(sk, val); break; case SO_BUF_LOCK: if (val & ~SOCK_BUF_LOCK_MASK) { ret = -EINVAL; break; } sk->sk_userlocks = val | (sk->sk_userlocks & ~SOCK_BUF_LOCK_MASK); break; case SO_RESERVE_MEM: { int delta; if (val < 0) { ret = -EINVAL; break; } delta = val - sk->sk_reserved_mem; if (delta < 0) sock_release_reserved_memory(sk, -delta); else ret = sock_reserve_memory(sk, delta); break; } default: ret = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return ret; } int sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { return sk_setsockopt(sock->sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) { const struct cred *cred; spin_lock(&sk->sk_peer_lock); cred = get_cred(sk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); return cred; } static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; if (cred) { struct user_namespace *current_ns = current_user_ns(); ucred->uid = from_kuid_munged(current_ns, cred->euid); ucred->gid = from_kgid_munged(current_ns, cred->egid); } } static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; for (i = 0; i < src->ngroups; i++) { gid_t gid = from_kgid_munged(user_ns, src->gid[i]); if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; } return 0; } int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct socket *sock = sk->sk_socket; union { int val; u64 val64; unsigned long ulval; struct linger ling; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; struct sock_txtime txtime; struct so_timestamping timestamping; } v; int lv = sizeof(int); int len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; memset(&v, 0, sizeof(v)); switch (optname) { case SO_DEBUG: v.val = sock_flag(sk, SOCK_DBG); break; case SO_DONTROUTE: v.val = sock_flag(sk, SOCK_LOCALROUTE); break; case SO_BROADCAST: v.val = sock_flag(sk, SOCK_BROADCAST); break; case SO_SNDBUF: v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: v.val = sk->sk_reuse; break; case SO_REUSEPORT: v.val = sk->sk_reuseport; break; case SO_KEEPALIVE: v.val = sock_flag(sk, SOCK_KEEPOPEN); break; case SO_TYPE: v.val = sk->sk_type; break; case SO_PROTOCOL: v.val = sk->sk_protocol; break; case SO_DOMAIN: v.val = sk->sk_family; break; case SO_ERROR: v.val = -sock_error(sk); if (v.val == 0) v.val = xchg(&sk->sk_err_soft, 0); break; case SO_OOBINLINE: v.val = sock_flag(sk, SOCK_URGINLINE); break; case SO_NO_CHECK: v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: v.val = READ_ONCE(sk->sk_priority); break; case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; case SO_TIMESTAMPNS_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMP_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPNS_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPING_OLD: lv = sizeof(v.timestamping); v.timestamping.flags = READ_ONCE(sk->sk_tsflags); v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); break; case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, SO_RCVTIMEO_OLD == optname); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: v.val = 1; break; case SO_PASSCRED: v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; case SO_PASSPIDFD: v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); break; case SO_PEERCRED: { struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); if (copy_to_sockptr(optval, &peercred, len)) return -EFAULT; goto lenout; } case SO_PEERPIDFD: { struct pid *peer_pid; struct file *pidfd_file = NULL; int pidfd; if (len > sizeof(pidfd)) len = sizeof(pidfd); spin_lock(&sk->sk_peer_lock); peer_pid = get_pid(sk->sk_peer_pid); spin_unlock(&sk->sk_peer_lock); if (!peer_pid) return -ENODATA; pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); put_pid(peer_pid); if (pidfd < 0) return pidfd; if (copy_to_sockptr(optval, &pidfd, len) || copy_to_sockptr(optlen, &len, sizeof(int))) { put_unused_fd(pidfd); fput(pidfd_file); return -EFAULT; } fd_install(pidfd, pidfd_file); return 0; } case SO_PEERGROUPS: { const struct cred *cred; int ret, n; cred = sk_get_peer_cred(sk); if (!cred) return -ENODATA; n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; goto lenout; } case SO_PEERNAME: { struct sockaddr_storage address; lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } /* Dubious BSD thing... Probably nobody even uses it, but * the UNIX standard wants it for whatever reason... -DaveM */ case SO_ACCEPTCONN: v.val = sk->sk_state == TCP_LISTEN; break; case SO_PASSSEC: v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); break; case SO_PEERSEC: return security_socket_getpeersec_stream(sock, optval, optlen, len); case SO_MARK: v.val = READ_ONCE(sk->sk_mark); break; case SO_RCVMARK: v.val = sock_flag(sk, SOCK_RCVMARK); break; case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; case SO_WIFI_STATUS: v.val = sock_flag(sk, SOCK_WIFI_STATUS); break; case SO_PEEK_OFF: if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); break; case SO_BINDTODEVICE: return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: len = sk_get_filter(sk, optval, len); if (len < 0) return len; goto lenout; case SO_LOCK_FILTER: v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; case SO_BPF_EXTENSIONS: v.val = bpf_tell_extensions(); break; case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: v.val = READ_ONCE(sk->sk_ll_usec); break; case SO_PREFER_BUSY_POLL: v.val = READ_ONCE(sk->sk_prefer_busy_poll); break; #endif case SO_MAX_PACING_RATE: /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); v.ulval = READ_ONCE(sk->sk_max_pacing_rate); } else { /* 32bit version */ v.val = min_t(unsigned long, ~0U, READ_ONCE(sk->sk_max_pacing_rate)); } break; case SO_INCOMING_CPU: v.val = READ_ONCE(sk->sk_incoming_cpu); break; case SO_MEMINFO: { u32 meminfo[SK_MEMINFO_VARS]; sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; } #ifdef CONFIG_NET_RX_BUSY_POLL case SO_INCOMING_NAPI_ID: v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ if (v.val < MIN_NAPI_ID) v.val = 0; break; #endif case SO_COOKIE: lv = sizeof(u64); if (len < lv) return -EINVAL; v.val64 = sock_gen_cookie(sk); break; case SO_ZEROCOPY: v.val = sock_flag(sk, SOCK_ZEROCOPY); break; case SO_TXTIME: lv = sizeof(v.txtime); v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; v.txtime.flags |= sk->sk_txtime_report_errors ? SOF_TXTIME_REPORT_ERRORS : 0; break; case SO_BINDTOIFINDEX: v.val = READ_ONCE(sk->sk_bound_dev_if); break; case SO_NETNS_COOKIE: lv = sizeof(u64); if (len != lv) return -EINVAL; v.val64 = sock_net(sk)->net_cookie; break; case SO_BUF_LOCK: v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; break; case SO_RESERVE_MEM: v.val = READ_ONCE(sk->sk_reserved_mem); break; case SO_TXREHASH: /* Paired with WRITE_ONCE() in sk_setsockopt() */ v.val = READ_ONCE(sk->sk_txrehash); break; default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). */ return -ENOPROTOOPT; } if (len > lv) len = lv; if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } /* * Initialize an sk_lock. * * (We also register the sk_lock with the lock validator.) */ static inline void sock_lock_init(struct sock *sk) { if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, af_family_kern_slock_key_strings[sk->sk_family], af_family_kern_slock_keys + sk->sk_family, af_family_kern_key_strings[sk->sk_family], af_family_kern_keys + sk->sk_family); else sock_lock_init_class_and_name( sk, af_family_slock_key_strings[sk->sk_family], af_family_slock_keys + sk->sk_family, af_family_key_strings[sk->sk_family], af_family_keys + sk->sk_family); } /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, * even temporarly, because of RCU lookups. sk_node should also be left as is. * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) { const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif /* If we move sk_tx_queue_mapping out of the private section, * we must check if sk_tx_queue_clear() is called after * sock_copy() in sk_clone_lock(). */ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < offsetof(struct sock, sk_dontcopy_begin) || offsetof(struct sock, sk_tx_queue_mapping) >= offsetof(struct sock, sk_dontcopy_end)); memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); #endif } static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { struct sock *sk; struct kmem_cache *slab; slab = prot->slab; if (slab != NULL) { sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; if (want_init_on_alloc(priority)) sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { if (security_sk_alloc(sk, family, priority)) goto out_free; if (!try_module_get(prot->owner)) goto out_free_sec; } return sk; out_free_sec: security_sk_free(sk); out_free: if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); return NULL; } static void sk_prot_free(struct proto *prot, struct sock *sk) { struct kmem_cache *slab; struct module *owner; owner = prot->owner; slab = prot->slab; cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); module_put(owner); } /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk; sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); if (sk) { sk->sk_family = family; /* * See comment in struct sock definition to understand * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); sk_tx_queue_clear(sk); } return sk; } EXPORT_SYMBOL(sk_alloc); /* Sockets having SOCK_RCU_FREE will call this function after one RCU * grace period. This is the case for UDP sockets and TCP listeners. */ static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); struct sk_filter *filter; if (sk->sk_destruct) sk->sk_destruct(sk); filter = rcu_dereference_check(sk->sk_filter, refcount_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); #ifdef CONFIG_BPF_SYSCALL bpf_sk_storage_free(sk); #endif if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; } /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); if (likely(sk->sk_net_refcnt)) put_net_track(sock_net(sk), &sk->ns_tracker); else __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); sk_prot_free(sk->sk_prot_creator, sk); } void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; } if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); } static void __sk_free(struct sock *sk) { if (likely(sk->sk_net_refcnt)) sock_inuse_add(sock_net(sk), -1); if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) sock_diag_broadcast_destroy(sk); else sk_destruct(sk); } void sk_free(struct sock *sk) { /* * We subtract one from sk_wmem_alloc and can know if * some packets are still in some tx queue. * If not null, sock_wfree() will call __sk_free(sk) later */ if (refcount_dec_and_test(&sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sk_free); static void sk_init_common(struct sock *sk) { skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_receive_queue.lock, af_rlock_keys + sk->sk_family, af_family_rlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_write_queue.lock, af_wlock_keys + sk->sk_family, af_family_wlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_error_queue.lock, af_elock_keys + sk->sk_family, af_family_elock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); } /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; bool is_charged = true; struct sock *newsk; newsk = sk_prot_alloc(prot, priority, sk->sk_family); if (!newsk) goto out; sock_copy(newsk, sk); newsk->sk_prot_creator = prot; /* SANITY */ if (likely(newsk->sk_net_refcnt)) { get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); sock_inuse_add(sock_net(newsk), 1); } else { /* Kernel sockets are not elevating the struct net refcount. * Instead, use a tracker to more easily detect if a layer * is not properly dismantling its kernel sockets at netns * destroy time. */ __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ refcount_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); newsk->sk_dst_cache = NULL; newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); RCU_INIT_POINTER(newsk->sk_filter, filter); rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new * socket if we couldn't charge it in the first place * as otherwise we uncharge the parent's filter. */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); sk_free_unlock_clone(newsk); newsk = NULL; goto out; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); if (bpf_sk_storage_clone(sk, newsk)) { sk_free_unlock_clone(newsk); newsk = NULL; goto out; } /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. */ if (sk_user_data_is_nocopy(newsk)) newsk->sk_user_data = NULL; newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); /* Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); sk_set_socket(newsk, NULL); sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); out: return newsk; } EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_free_unlock_clone(struct sock *sk) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ sk->sk_destruct = NULL; bh_unlock_sock(sk); sk_free(sk); } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { bool is_ipv6 = false; u32 max_size; #if IS_ENABLED(CONFIG_IPV6) is_ipv6 = (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : READ_ONCE(dst->dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; return max_size - (MAX_TCP_HEADER + 1); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; sk->sk_route_caps = dst->dev->features; if (sk_is_tcp(sk)) sk->sk_route_caps |= NETIF_F_GSO; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled)) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; if (sk_can_gso(sk)) { if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); } EXPORT_SYMBOL_GPL(sk_setup_caps); /* * Simple resource managers for sockets. */ /* * Write buffer destructor automatically called from kfree_skb. */ void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; bool free; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) && sk->sk_write_space == sock_def_write_space) { rcu_read_lock(); free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); sock_def_write_space_wfree(sk); rcu_read_unlock(); if (unlikely(free)) __sk_free(sk); return; } /* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call */ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); sk->sk_write_space(sk); len = 1; } /* * if sk_wmem_alloc reaches 0, we must finish what sk_free() * could not do because of in-flight packets */ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); /* This variant of sock_wfree() is used by TCP, * since it sets SOCK_USE_WRITE_QUEUE. */ void __sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) __sk_free(sk); } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; #ifdef CONFIG_INET if (unlikely(!sk_fullsock(sk))) { skb->destructor = sock_edemux; sock_hold(sk); return; } #endif skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation * is enough to guarantee sk_free() wont free this sock until * all in-flight packets are completed */ refcount_add(skb->truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { #ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ if (skb->decrypted) return false; #endif return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. * But we also want to keep skb->sk set because some packet schedulers * rely on it (sch_fq for example). */ void skb_orphan_partial(struct sk_buff *skb) { if (skb_is_tcp_pure_ack(skb)) return; if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) return; skb_orphan(skb); } EXPORT_SYMBOL(skb_orphan_partial); /* * Read buffer destructor automatically called from kfree_skb. */ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; atomic_sub(len, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, len); } EXPORT_SYMBOL(sock_rfree); /* * Buffer destructor for skbs that are not used directly in read or write * path, e.g. for error handler skbs. Automatically called from kfree_skb. */ void sock_efree(struct sk_buff *skb) { sock_put(skb->sk); } EXPORT_SYMBOL(sock_efree); /* Buffer destructor for prefetch/receive path where reference count may * not be held, e.g. for listen sockets. */ #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { if (sk_is_refcounted(skb->sk)) sock_gen_put(skb->sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ kuid_t sock_i_uid(struct sock *sk) { kuid_t uid; read_lock_bh(&sk->sk_callback_lock); uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; read_unlock_bh(&sk->sk_callback_lock); return uid; } EXPORT_SYMBOL(sock_i_uid); unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; read_lock(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; read_unlock(&sk->sk_callback_lock); return ino; } EXPORT_SYMBOL(__sock_i_ino); unsigned long sock_i_ino(struct sock *sk) { unsigned long ino; local_bh_disable(); ino = __sock_i_ino(sk); local_bh_enable(); return ino; } EXPORT_SYMBOL(sock_i_ino); /* * Allocate a skb from the socket's send buffer. */ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { if (force || refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) { skb_set_owner_w(skb, sk); return skb; } } return NULL; } EXPORT_SYMBOL(sock_wmalloc); static void sock_ofree(struct sk_buff *skb) { struct sock *sk = skb->sk; atomic_sub(skb->truesize, &sk->sk_omem_alloc); } struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority) { struct sk_buff *skb; /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > READ_ONCE(sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); if (!skb) return NULL; atomic_add(skb->truesize, &sk->sk_omem_alloc); skb->sk = sk; skb->destructor = sock_ofree; return skb; } /* * Allocate a memory block from the socket's option memory buffer. */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { int optmem_max = READ_ONCE(sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. */ atomic_add(size, &sk->sk_omem_alloc); mem = kmalloc(size, priority); if (mem) return mem; atomic_sub(size, &sk->sk_omem_alloc); } return NULL; } EXPORT_SYMBOL(sock_kmalloc); /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. */ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, const bool nullify) { if (WARN_ON_ONCE(!mem)) return; if (nullify) kfree_sensitive(mem); else kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } void sock_kfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, false); } EXPORT_SYMBOL(sock_kfree_s); void sock_kzfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, true); } EXPORT_SYMBOL(sock_kzfree_s); /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. */ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; if (signal_pending(current)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) break; if (READ_ONCE(sk->sk_err)) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Generic send/receive buffer handlers */ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { struct sk_buff *skb; long timeo; int err; timeo = sock_sndtimeo(sk, noblock); for (;;) { err = sock_error(sk); if (err != 0) goto failure; err = -EPIPE; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto failure; if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) goto failure; if (signal_pending(current)) goto interrupted; timeo = sock_wait_for_wmem(sk, timeo); } skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation); if (skb) skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: *errcode = err; return NULL; } EXPORT_SYMBOL(sock_alloc_send_pskb); int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; tsflags = *(u32 *)CMSG_DATA(cmsg); if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) return -EINVAL; sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; case SCM_TXTIME: if (!sock_flag(sk, SOCK_TXTIME)) return -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(__sock_cmsg_send); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; int ret; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; ret = __sock_cmsg_send(sk, cmsg, sockc); if (ret) return ret; } return 0; } EXPORT_SYMBOL(sock_cmsg_send); static void sk_enter_memory_pressure(struct sock *sk) { if (!sk->sk_prot->enter_memory_pressure) return; sk->sk_prot->enter_memory_pressure(sk); } static void sk_leave_memory_pressure(struct sock *sk) { if (sk->sk_prot->leave_memory_pressure) { INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, tcp_leave_memory_pressure, sk); } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; if (memory_pressure && READ_ONCE(*memory_pressure)) WRITE_ONCE(*memory_pressure, 0); } } DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get * @pfrag: pointer to page_frag * @gfp: priority for memory allocation * * Note: While this allocator tries to use high order pages, there is * no guarantee that allocations succeed. Therefore, @sz MUST be * less or equal than PAGE_SIZE. */ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER && !static_branch_unlikely(&net_high_order_alloc_disable_key)) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } } pfrag->page = alloc_page(gfp); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE; return true; } return false; } EXPORT_SYMBOL(skb_page_frag_refill); bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; } EXPORT_SYMBOL(sk_page_frag_refill); void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_bh(&sk->sk_lock.slock); schedule(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user(sk)) break; } finish_wait(&sk->sk_lock.wq, &wait); } void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); do { next = skb->next; prefetch(next); DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); cond_resched(); skb = next; } while (skb != NULL); spin_lock_bh(&sk->sk_lock.slock); } /* * Doing the zeroing here guarantee we can not loop forever * while a wild producer attempts to flood us. */ sk->sk_backlog.len = 0; } void __sk_flush_backlog(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); __release_sock(sk); if (sk->sk_prot->release_cb) INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, tcp_release_cb, sk); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(__sk_flush_backlog); /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); /** * __sk_mem_raise_allocated - increase memory_allocated * @sk: socket * @size: memory size to allocate * @amt: pages to allocate * @kind: allocation type * * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. * * Unlike the globally shared limits among the sockets under same protocol, * consuming the budget of a memcg won't have direct effect on other ones. * So be optimistic about memcg's tolerance, and leave the callers to decide * whether or not to raise allocated through sk_under_memory_pressure() or * its variants. */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; struct proto *prot = sk->sk_prot; bool charged = false; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (memcg) { if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge())) goto suppress_allocation; charged = true; } /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } /* Under pressure. */ if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); /* Over hard limit. */ if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* Guarantee minimum buffer size under pressure (either global * or memcg) to make sure features described in RFC 7323 (TCP * Extensions for High Performance) work properly. * * This rule does NOT stand when exceeds global or memcg's hard * limit, or else a DoS attack can be taken place by spawning * lots of sockets whose usage are under minimum buffer size. */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ int wmem0 = sk_get_wmem0(sk, prot); if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < wmem0) return 1; } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; } } if (sk_has_memory_pressure(sk)) { u64 alloc; /* The following 'average' heuristic is within the * scope of global accounting, so it only makes * sense for global memory pressure. */ if (!sk_under_global_memory_pressure(sk)) return 1; /* Try to be fair among all the sockets under global * pressure by allowing the ones that below average * usage to raise. */ alloc = sk_sockets_allocated_read_positive(sk); if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + atomic_read(&sk->sk_rmem_alloc) + sk->sk_forward_alloc)) return 1; } suppress_allocation: if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { sk_stream_moderate_sndbuf(sk); /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ if (memcg && !charged) { mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } return 1; } } if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); if (charged) mem_cgroup_uncharge_skmem(memcg, amt); return 0; } /** * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated * @sk: socket * @size: memory size to allocate * @kind: allocation type * * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means * rmem allocation. This function assumes that protocols which have * memory_pressure use sk_wmem_queued as write buffer accounting. */ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); return ret; } EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_mem_reduce_allocated - reclaim memory_allocated * @sk: socket * @amount: number of quanta * * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { sk_memory_allocated_sub(sk, amount); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } /** * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) */ void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) { WRITE_ONCE(sk->sk_peek_off, val); return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); /* * Set of default routines for initialising struct proto_ops when * the protocol does not support a particular function. In certain * cases where it makes no sense for a protocol to have a "do nothing" * function, some default processing is provided. */ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_connect); int sock_no_socketpair(struct socket *sock1, struct socket *sock2) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_socketpair); int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_getname); int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_ioctl); int sock_no_listen(struct socket *sock, int backlog) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_listen); int sock_no_shutdown(struct socket *sock, int how) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_shutdown); int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg_locked); int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_recvmsg); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { /* Mirror missing mmap method error code */ return -ENODEV; } EXPORT_SYMBOL(sock_no_mmap); /* * When a file is received (via SCM_RIGHTS, etc), we must bump the * various sock-based usage counts. */ void __receive_sock(struct file *file) { struct socket *sock; sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); } } /* * Default Socket Callbacks */ static void sock_def_wakeup(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } void sock_def_readable(struct sock *sk) { struct socket_wq *wq; trace_sk_data_ready(sk); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if (sock_writeable(sk)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* An optimised version of sock_def_write_space(), should only be called * for SOCK_RCU_FREE sockets under RCU read section and after putting * ->sk_wmem_alloc. */ static void sock_def_write_space_wfree(struct sock *sk) { /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if (sock_writeable(sk)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); /* rely on refcount_sub from sock_wfree() */ smp_mb__after_atomic(); if (wq && waitqueue_active(&wq->wait)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } } static void sock_def_destruct(struct sock *sk) { } void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) if (send_sigurg(&sk->sk_socket->file->f_owner)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); void sk_reset_timer(struct sock *sk, struct timer_list* timer, unsigned long expires) { if (!mod_timer(timer, expires)) sock_hold(sk); } EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { if (del_timer(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { if (del_timer_sync(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer_sync); void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); if (sock) { sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); } sk->sk_uid = uid; rwlock_init(&sk->sk_callback_lock); if (sk->sk_kern_sock) lockdep_set_class_and_name( &sk->sk_callback_lock, af_kern_callback_keys + sk->sk_family, af_family_kern_clock_key_strings[sk->sk_family]); else lockdep_set_class_and_name( &sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; sk->sk_error_report = sock_def_error_report; sk->sk_destruct = sock_def_destruct; sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; sk->sk_peek_off = -1; sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; spin_lock_init(&sk->sk_peer_lock); sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; #if BITS_PER_LONG==32 seqlock_init(&sk->sk_stamp_seq); #endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } EXPORT_SYMBOL(sock_init_data_uid); void sock_init_data(struct socket *sock, struct sock *sk) { kuid_t uid = sock ? SOCK_INODE(sock)->i_uid : make_kuid(sock_net(sk)->user_ns, 0); sock_init_data_uid(sock, sk, uid); } EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) { /* The sk_lock has mutex_lock() semantics here. */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (sock_owned_by_user_nocheck(sk)) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); if (sk->sk_prot->release_cb) INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, tcp_release_cb, sk); sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user_nocheck(sk)) { /* * Fast path return with bottom halves disabled and * sock::sk_lock.slock held. * * The 'mutex' is not contended and holding * sock::sk_lock.slock prevents all other lockers to * proceed so the corresponding unlock_sock_fast() can * avoid the slow path of release_sock() completely and * just release slock. * * From a semantical POV this is equivalent to 'acquiring' * the 'mutex', hence the corresponding lockdep * mutex_release() has to happen in the fast path of * unlock_sock_fast(). */ return false; } __lock_sock(sk); sk->sk_lock.owned = 1; __acquire(&sk->sk_lock.slock); spin_unlock_bh(&sk->sk_lock.slock); return true; } EXPORT_SYMBOL(__lock_sock_fast); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32) { struct sock *sk = sock->sk; struct timespec64 ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec64(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } if (timeval) ts.tv_nsec /= 1000; #ifdef CONFIG_COMPAT_32BIT_TIME if (time32) return put_old_timespec32(&ts, userstamp); #endif #ifdef CONFIG_SPARC64 /* beware of padding in sparc64 timeval */ if (timeval && !in_compat_syscall()) { struct __kernel_old_timeval __user tv = { .tv_sec = ts.tv_sec, .tv_usec = ts.tv_nsec, }; if (copy_to_user(userstamp, &tv, sizeof(tv))) return -EFAULT; return 0; } #endif return put_timespec64(&ts, userstamp); } EXPORT_SYMBOL(sock_gettstamp); void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; sock_set_flag(sk, flag); /* * we just set one of the two flags which require net * time stamping, but time stamping might have been on * already because of the other one */ if (sock_needs_netstamp(sk) && !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; struct sk_buff *skb; int copied, err; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); msg->msg_flags |= MSG_ERRQUEUE; err = copied; out_free_skb: kfree_skb(skb); out: return err; } EXPORT_SYMBOL(sock_recv_errqueue); /* * Get a socket option on an socket. * * FIX: POSIX 1003.1g is very ambiguous here. It states that * asynchronous errors should be reported by getsockopt. We assume * this means if you specify SO_ERROR (otherwise whats the point of it). */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } EXPORT_SYMBOL(sock_common_recvmsg); /* * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) sk->sk_prot->destroy(sk); /* * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. * Step one, detach it from networking: * * A. Remove from hash tables. */ sk->sk_prot->unhash(sk); /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and * did hash table lookup before we unhashed socket. They will achieve * receive queue and will be purged by socket destructor. * * Also we still have packets pending on receive queue and probably, * our own packets waiting in device queues. sock_destroy will drain * receive queue, but transmitted packets will delay socket destruction * until the last reference will be released. */ sock_orphan(sk); xfrm_sk_free_policy(sk); sock_put(sk); } EXPORT_SYMBOL(sk_common_release); void sk_get_meminfo(const struct sock *sk, u32 *mem) { memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } #ifdef CONFIG_PROC_FS static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); int sock_prot_inuse_get(struct net *net, struct proto *prot) { int cpu, idx = prot->inuse_idx; int res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); int sock_inuse_get(struct net *net) { int cpu, res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; return res; } EXPORT_SYMBOL_GPL(sock_inuse_get); static int __net_init sock_inuse_init_net(struct net *net) { net->core.prot_inuse = alloc_percpu(struct prot_inuse); if (net->core.prot_inuse == NULL) return -ENOMEM; return 0; } static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.prot_inuse); } static struct pernet_operations net_inuse_ops = { .init = sock_inuse_init_net, .exit = sock_inuse_exit_net, }; static __init int net_inuse_init(void) { if (register_pernet_subsys(&net_inuse_ops)) panic("Cannot initialize net inuse counters"); return 0; } core_initcall(net_inuse_init); static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); return 0; } static void release_proto_idx(struct proto *prot) { if (prot->inuse_idx != PROTO_INUSE_NR - 1) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else static inline int assign_proto_idx(struct proto *prot) { return 0; } static inline void release_proto_idx(struct proto *prot) { } #endif static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) { if (!twsk_prot) return; kfree(twsk_prot->twsk_slab_name); twsk_prot->twsk_slab_name = NULL; kmem_cache_destroy(twsk_prot->twsk_slab); twsk_prot->twsk_slab = NULL; } static int tw_prot_init(const struct proto *prot) { struct timewait_sock_ops *twsk_prot = prot->twsk_prot; if (!twsk_prot) return 0; twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); if (!twsk_prot->twsk_slab_name) return -ENOMEM; twsk_prot->twsk_slab = kmem_cache_create(twsk_prot->twsk_slab_name, twsk_prot->twsk_obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!twsk_prot->twsk_slab) { pr_crit("%s: Can't create timewait sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { if (!rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; kmem_cache_destroy(rsk_prot->slab); rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) { struct request_sock_ops *rsk_prot = prot->rsk_prot; if (!rsk_prot) return 0; rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); if (!rsk_prot->slab_name) return -ENOMEM; rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } int proto_register(struct proto *prot, int alloc_slab) { int ret = -ENOBUFS; if (prot->memory_allocated && !prot->sysctl_mem) { pr_err("%s: missing sysctl_mem\n", prot->name); return -EINVAL; } if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); return -EINVAL; } if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | prot->slab_flags, prot->useroffset, prot->usersize, NULL); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); goto out; } if (req_prot_init(prot)) goto out_free_request_sock_slab; if (tw_prot_init(prot)) goto out_free_timewait_sock_slab; } mutex_lock(&proto_list_mutex); ret = assign_proto_idx(prot); if (ret) { mutex_unlock(&proto_list_mutex); goto out_free_timewait_sock_slab; } list_add(&prot->node, &proto_list); mutex_unlock(&proto_list_mutex); return ret; out_free_timewait_sock_slab: if (alloc_slab) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { req_prot_cleanup(prot->rsk_prot); kmem_cache_destroy(prot->slab); prot->slab = NULL; } out: return ret; } EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) { mutex_lock(&proto_list_mutex); release_proto_idx(prot); list_del(&prot->node); mutex_unlock(&proto_list_mutex); kmem_cache_destroy(prot->slab); prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); tw_prot_cleanup(prot->twsk_prot); } EXPORT_SYMBOL(proto_unregister); int sock_load_diag_module(int family, int protocol) { if (!protocol) { if (!sock_is_registered(family)) return -ENOENT; return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family); } #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family, protocol); } EXPORT_SYMBOL(sock_load_diag_module); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) __acquires(proto_list_mutex) { mutex_lock(&proto_list_mutex); return seq_list_start_head(&proto_list, *pos); } static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &proto_list, pos); } static void proto_seq_stop(struct seq_file *seq, void *v) __releases(proto_list_mutex) { mutex_unlock(&proto_list_mutex); } static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } static long sock_prot_memory_allocated(struct proto *proto) { return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; } static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), sock_prot_memory_allocated(proto), sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? "no" : "yes", module_name(proto->owner), proto_method_implemented(proto->close), proto_method_implemented(proto->connect), proto_method_implemented(proto->disconnect), proto_method_implemented(proto->accept), proto_method_implemented(proto->ioctl), proto_method_implemented(proto->init), proto_method_implemented(proto->destroy), proto_method_implemented(proto->shutdown), proto_method_implemented(proto->setsockopt), proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), proto_method_implemented(proto->unhash), proto_method_implemented(proto->get_port), proto_method_implemented(proto->enter_memory_pressure)); } static int proto_seq_show(struct seq_file *seq, void *v) { if (v == &proto_list) seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", "protocol", "size", "sockets", "memory", "press", "maxhdr", "slab", "module", "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; } static const struct seq_operations proto_seq_ops = { .start = proto_seq_start, .next = proto_seq_next, .stop = proto_seq_stop, .show = proto_seq_show, }; static __net_init int proto_init_net(struct net *net) { if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } static __net_exit void proto_exit_net(struct net *net) { remove_proc_entry("protocols", net->proc_net); } static __net_initdata struct pernet_operations proto_net_ops = { .init = proto_init_net, .exit = proto_exit_net, }; static int __init proto_init(void) { return register_pernet_subsys(&proto_net_ops); } subsys_initcall(proto_init); #endif /* PROC_FS */ #ifdef CONFIG_NET_RX_BUSY_POLL bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; return !skb_queue_empty_lockless(&sk->sk_receive_queue) || sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) { if (!sk->sk_prot->bind_add) return -EOPNOTSUPP; return sk->sk_prot->bind_add(sk, addr, addr_len); } EXPORT_SYMBOL(sock_bind_add); /* Copy 'size' bytes from userspace and return `size` back to userspace */ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size) { int ret; if (copy_from_user(karg, arg, size)) return -EFAULT; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); if (ret) return ret; if (copy_to_user(arg, karg, size)) return -EFAULT; return 0; } EXPORT_SYMBOL(sock_ioctl_inout); /* This is the most common ioctl prep function, where the result (4 bytes) is * copied back to userspace if the ioctl() returns successfully. No input is * copied from userspace as input argument. */ static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) { int ret, karg = 0; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); if (ret) return ret; return put_user(karg, (int __user *)arg); } /* A wrapper around sock ioctls, which copies the data from userspace * (depending on the protocol/ioctl), and copies back the result to userspace. * The main motivation for this function is to pass kernel memory to the * protocol ioctl callbacks, instead of userspace memory. */ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { int rc = 1; if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) rc = ipmr_sk_ioctl(sk, cmd, arg); else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) rc = ip6mr_sk_ioctl(sk, cmd, arg); else if (sk_is_phonet(sk)) rc = phonet_sk_ioctl(sk, cmd, arg); /* If ioctl was processed, returns its value */ if (rc <= 0) return rc; /* Otherwise call the default handler */ return sock_ioctl_out(sk, cmd, arg); } EXPORT_SYMBOL(sk_ioctl);
5391 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor file mediation function definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #ifndef __AA_FILE_H #define __AA_FILE_H #include <linux/spinlock.h> #include "domain.h" #include "match.h" #include "perms.h" struct aa_policydb; struct aa_profile; struct path; #define mask_mode_t(X) (X & (MAY_EXEC | MAY_WRITE | MAY_READ | MAY_APPEND)) #define AA_AUDIT_FILE_MASK (MAY_READ | MAY_WRITE | MAY_EXEC | MAY_APPEND |\ AA_MAY_CREATE | AA_MAY_DELETE | \ AA_MAY_GETATTR | AA_MAY_SETATTR | \ AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_LOCK | \ AA_EXEC_MMAP | AA_MAY_LINK) static inline struct aa_file_ctx *file_ctx(struct file *file) { return file->f_security + apparmor_blob_sizes.lbs_file; } /* struct aa_file_ctx - the AppArmor context the file was opened in * @lock: lock to update the ctx * @label: label currently cached on the ctx * @perms: the permission the file was opened with */ struct aa_file_ctx { spinlock_t lock; struct aa_label __rcu *label; u32 allow; }; /* * The xindex is broken into 3 parts * - index - an index into either the exec name table or the variable table * - exec type - which determines how the executable name and index are used * - flags - which modify how the destination name is applied */ #define AA_X_INDEX_MASK AA_INDEX_MASK #define AA_X_TYPE_MASK 0x0c000000 #define AA_X_NONE AA_INDEX_NONE #define AA_X_NAME 0x04000000 /* use executable name px */ #define AA_X_TABLE 0x08000000 /* use a specified name ->n# */ #define AA_X_UNSAFE 0x10000000 #define AA_X_CHILD 0x20000000 #define AA_X_INHERIT 0x40000000 #define AA_X_UNCONFINED 0x80000000 /* need to make conditional which ones are being set */ struct path_cond { kuid_t uid; umode_t mode; }; #define COMBINED_PERM_MASK(X) ((X).allow | (X).audit | (X).quiet | (X).kill) int aa_audit_file(const struct cred *cred, struct aa_profile *profile, struct aa_perms *perms, const char *op, u32 request, const char *name, const char *target, struct aa_label *tlabel, kuid_t ouid, const char *info, int error); struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, aa_state_t state, struct path_cond *cond); aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, const char *name, struct path_cond *cond, struct aa_perms *perms); int aa_path_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, u32 request, struct path_cond *cond); int aa_path_link(const struct cred *subj_cred, struct aa_label *label, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry); int aa_file_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct file *file, u32 request, bool in_atomic); void aa_inherit_files(const struct cred *cred, struct files_struct *files); /** * aa_map_file_perms - map file flags to AppArmor permissions * @file: open file to map flags to AppArmor permissions * * Returns: apparmor permission set for the file */ static inline u32 aa_map_file_to_perms(struct file *file) { int flags = file->f_flags; u32 perms = 0; if (file->f_mode & FMODE_WRITE) perms |= MAY_WRITE; if (file->f_mode & FMODE_READ) perms |= MAY_READ; if ((flags & O_APPEND) && (perms & MAY_WRITE)) perms = (perms & ~MAY_WRITE) | MAY_APPEND; /* trunc implies write permission */ if (flags & O_TRUNC) perms |= MAY_WRITE; if (flags & O_CREAT) perms |= AA_MAY_CREATE; return perms; } #endif /* __AA_FILE_H */
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_FIB_RULES_H #define __NET_FIB_RULES_H #include <linux/types.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/fib_rules.h> #include <linux/refcount.h> #include <net/flow.h> #include <net/rtnetlink.h> #include <net/fib_notifier.h> #include <linux/indirect_call_wrapper.h> struct fib_kuid_range { kuid_t start; kuid_t end; }; struct fib_rule { struct list_head list; int iifindex; int oifindex; u32 mark; u32 mark_mask; u32 flags; u32 table; u8 action; u8 l3mdev; u8 proto; u8 ip_proto; u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; struct net *fr_net; refcount_t refcnt; u32 pref; int suppress_ifgroup; int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; struct fib_kuid_range uid_range; struct fib_rule_port_range sport_range; struct fib_rule_port_range dport_range; struct rcu_head rcu; }; struct fib_lookup_arg { void *lookup_ptr; const void *lookup_data; void *result; struct fib_rule *rule; u32 table; int flags; #define FIB_LOOKUP_NOREF 1 #define FIB_LOOKUP_IGNORE_LINKSTATE 2 }; struct fib_rules_ops { int family; struct list_head list; int rule_size; int addr_size; int unresolved_rules; int nr_goto_rules; unsigned int fib_rules_seq; int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); bool (*suppress)(struct fib_rule *, int, struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); int (*configure)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *, struct nlattr **, struct netlink_ext_ack *); int (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, struct nlattr **); int (*fill)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *); size_t (*nlmsg_payload)(struct fib_rule *); /* Called after modifications to the rules set, must flush * the route cache if one exists. */ void (*flush_cache)(struct fib_rules_ops *ops); int nlgroup; struct list_head rules_list; struct module *owner; struct net *fro_net; struct rcu_head rcu; }; struct fib_rule_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_rule *rule; }; static inline void fib_rule_get(struct fib_rule *rule) { refcount_inc(&rule->refcnt); } static inline void fib_rule_put(struct fib_rule *rule) { if (refcount_dec_and_test(&rule->refcnt)) kfree_rcu(rule, rcu); } #ifdef CONFIG_NET_L3_MASTER_DEV static inline u32 fib_rule_get_table(struct fib_rule *rule, struct fib_lookup_arg *arg) { return rule->l3mdev ? arg->table : rule->table; } #else static inline u32 fib_rule_get_table(struct fib_rule *rule, struct fib_lookup_arg *arg) { return rule->table; } #endif static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla) { if (nla[FRA_TABLE]) return nla_get_u32(nla[FRA_TABLE]); return frh->table; } static inline bool fib_rule_port_range_set(const struct fib_rule_port_range *range) { return range->start != 0 && range->end != 0; } static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, __be16 port) { return ntohs(port) >= a->start && ntohs(port) <= a->end; } static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) { return a->start != 0 && a->end != 0 && a->end < 0xffff && a->start <= a->end; } static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, struct fib_rule_port_range *b) { return a->start == b->start && a->end == b->end; } static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) { return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto || fib_rule_port_range_set(&rule->sport_range) || fib_rule_port_range_set(&rule->dport_range)); } struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, struct net *); void fib_rules_unregister(struct fib_rules_ops *); int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); unsigned int fib_rules_seq_read(struct net *net, int family); int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); INDIRECT_CALLABLE_DECLARE(int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); INDIRECT_CALLABLE_DECLARE(int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg)); #endif
129 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * * Copyright (C) 1997, Stephen Tweedie * * Provide stub functions for unreadable inodes * * Fabian Frederick : August 2003 - All file operations assigned to EIO */ #include <linux/fs.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } static const struct file_operations bad_file_ops = { .open = bad_file_open, }; static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return -EIO; } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EIO; } static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen) { return -EIO; } static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; } static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } static const char *bad_inode_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return ERR_PTR(-EIO); } static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } static int bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { return -EIO; } static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, umode_t create_mode) { return -EIO; } static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; } static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, .link = bad_inode_link, .unlink = bad_inode_unlink, .symlink = bad_inode_symlink, .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, .tmpfile = bad_inode_tmpfile, .set_acl = bad_inode_set_acl, }; /* * When a filesystem is unable to read an inode due to an I/O error in * its read_inode() function, it can call make_bad_inode() to return a * set of stubs which will return EIO errors as required. * * We only need to do limited initialisation: all other fields are * preinitialised to zero automatically. */ /** * make_bad_inode - mark an inode bad due to an I/O error * @inode: Inode to mark bad * * When an inode cannot be read due to a media or remote network * failure this function makes the inode "bad" and causes I/O operations * on it to fail from this point on. */ void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); inode->i_mode = S_IFREG; simple_inode_init_ts(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); /* * This tests whether an inode has been flagged as bad. The test uses * &bad_inode_ops to cover the case of invalidated inodes as well as * those created by make_bad_inode() above. */ /** * is_bad_inode - is an inode errored * @inode: inode to test * * Returns true if the inode in question has been marked as bad. */ bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } EXPORT_SYMBOL(is_bad_inode); /** * iget_failed - Mark an under-construction inode as dead and release it * @inode: The inode to discard * * Mark an under-construction inode as dead and release it. */ void iget_failed(struct inode *inode) { make_bad_inode(inode); unlock_new_inode(inode); iput(inode); } EXPORT_SYMBOL(iget_failed);
208 208 208 208 208 208 208 231 208 205 203 203 28 28 28 28 28 28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. */ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *event_ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); } static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); if (master_ndev) dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. */ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. */ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We relay on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); }
43 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2003 * Copyright (c) Cisco 1999,2000 * Copyright (c) Motorola 1999,2000,2001 * Copyright (c) La Monte H.P. Yarroll 2001 * * This file is part of the SCTP kernel implementation. * * A collection class to handle the storage of transport addresses. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> */ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/if_inet6.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* Forward declarations for internal helpers. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, union sctp_addr *addr, enum sctp_scope scope, gfp_t gfp, int flags); static void sctp_bind_addr_clean(struct sctp_bind_addr *); /* First Level Abstractions. */ /* Copy 'src' to 'dest' taking 'scope' into account. Omit addresses * in 'src' which have a broader scope than 'scope'. */ int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, enum sctp_scope scope, gfp_t gfp, int flags) { struct sctp_sockaddr_entry *addr; int error = 0; /* All addresses share the same port. */ dest->port = src->port; /* Extract the addresses which are relevant for this scope. */ list_for_each_entry(addr, &src->address_list, list) { error = sctp_copy_one_addr(net, dest, &addr->a, scope, gfp, flags); if (error < 0) goto out; } /* If there are no addresses matching the scope and * this is global scope, try to get a link scope address, with * the assumption that we must be sitting behind a NAT. */ if (list_empty(&dest->address_list) && (SCTP_SCOPE_GLOBAL == scope)) { list_for_each_entry(addr, &src->address_list, list) { error = sctp_copy_one_addr(net, dest, &addr->a, SCTP_SCOPE_LINK, gfp, flags); if (error < 0) goto out; } } /* If somehow no addresses were found that can be used with this * scope, it's an error. */ if (list_empty(&dest->address_list)) error = -ENETUNREACH; out: if (error) sctp_bind_addr_clean(dest); return error; } /* Exactly duplicate the address lists. This is necessary when doing * peer-offs and accepts. We don't want to put all the current system * addresses into the endpoint. That's useless. But we do want duplicat * the list of bound addresses that the older endpoint used. */ int sctp_bind_addr_dup(struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, gfp_t gfp) { struct sctp_sockaddr_entry *addr; int error = 0; /* All addresses share the same port. */ dest->port = src->port; list_for_each_entry(addr, &src->address_list, list) { error = sctp_add_bind_addr(dest, &addr->a, sizeof(addr->a), 1, gfp); if (error < 0) break; } return error; } /* Initialize the SCTP_bind_addr structure for either an endpoint or * an association. */ void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port) { INIT_LIST_HEAD(&bp->address_list); bp->port = port; } /* Dispose of the address list. */ static void sctp_bind_addr_clean(struct sctp_bind_addr *bp) { struct sctp_sockaddr_entry *addr, *temp; /* Empty the bind address list. */ list_for_each_entry_safe(addr, temp, &bp->address_list, list) { list_del_rcu(&addr->list); kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); } } /* Dispose of an SCTP_bind_addr structure */ void sctp_bind_addr_free(struct sctp_bind_addr *bp) { /* Empty the bind address list. */ sctp_bind_addr_clean(bp); } /* Add an address to the bind address list in the SCTP_bind_addr structure. */ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, int new_size, __u8 addr_state, gfp_t gfp) { struct sctp_sockaddr_entry *addr; /* Add the address to the bind address list. */ addr = kzalloc(sizeof(*addr), gfp); if (!addr) return -ENOMEM; memcpy(&addr->a, new, min_t(size_t, sizeof(*new), new_size)); /* Fix up the port if it has not yet been set. * Both v4 and v6 have the port at the same offset. */ if (!addr->a.v4.sin_port) addr->a.v4.sin_port = htons(bp->port); addr->state = addr_state; addr->valid = 1; INIT_LIST_HEAD(&addr->list); /* We always hold a socket lock when calling this function, * and that acts as a writer synchronizing lock. */ list_add_tail_rcu(&addr->list, &bp->address_list); SCTP_DBG_OBJCNT_INC(addr); return 0; } /* Delete an address from the bind address list in the SCTP_bind_addr * structure. */ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr) { struct sctp_sockaddr_entry *addr, *temp; int found = 0; /* We hold the socket lock when calling this function, * and that acts as a writer synchronizing lock. */ list_for_each_entry_safe(addr, temp, &bp->address_list, list) { if (sctp_cmp_addr_exact(&addr->a, del_addr)) { /* Found the exact match. */ found = 1; addr->valid = 0; list_del_rcu(&addr->list); break; } } if (found) { kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); return 0; } return -EINVAL; } /* Create a network byte-order representation of all the addresses * formated as SCTP parameters. * * The second argument is the return value for the length. */ union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, int *addrs_len, gfp_t gfp) { union sctp_params addrparms; union sctp_params retval; int addrparms_len; union sctp_addr_param rawaddr; int len; struct sctp_sockaddr_entry *addr; struct list_head *pos; struct sctp_af *af; addrparms_len = 0; len = 0; /* Allocate enough memory at once. */ list_for_each(pos, &bp->address_list) { len += sizeof(union sctp_addr_param); } /* Don't even bother embedding an address if there * is only one. */ if (len == sizeof(union sctp_addr_param)) { retval.v = NULL; goto end_raw; } retval.v = kmalloc(len, gfp); if (!retval.v) goto end_raw; addrparms = retval; list_for_each_entry(addr, &bp->address_list, list) { af = sctp_get_af_specific(addr->a.v4.sin_family); len = af->to_addr_param(&addr->a, &rawaddr); memcpy(addrparms.v, &rawaddr, len); addrparms.v += len; addrparms_len += len; } end_raw: *addrs_len = addrparms_len; return retval; } /* * Create an address list out of the raw address list format (IPv4 and IPv6 * address parameters). */ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, int addrs_len, __u16 port, gfp_t gfp) { union sctp_addr_param *rawaddr; struct sctp_paramhdr *param; union sctp_addr addr; int retval = 0; int len; struct sctp_af *af; /* Convert the raw address to standard address format */ while (addrs_len) { param = (struct sctp_paramhdr *)raw_addr_list; rawaddr = (union sctp_addr_param *)raw_addr_list; af = sctp_get_af_specific(param_type2af(param->type)); if (unlikely(!af) || !af->from_addr_param(&addr, rawaddr, htons(port), 0)) { retval = -EINVAL; goto out_err; } if (sctp_bind_addr_state(bp, &addr) != -1) goto next; retval = sctp_add_bind_addr(bp, &addr, sizeof(addr), SCTP_ADDR_SRC, gfp); if (retval) /* Can't finish building the list, clean up. */ goto out_err; next: len = ntohs(param->length); addrs_len -= len; raw_addr_list += len; } return retval; out_err: if (retval) sctp_bind_addr_clean(bp); return retval; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Does this contain a specified address? Allow wildcarding. */ int sctp_bind_addr_match(struct sctp_bind_addr *bp, const union sctp_addr *addr, struct sctp_sock *opt) { struct sctp_sockaddr_entry *laddr; int match = 0; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; if (opt->pf->cmp_addr(&laddr->a, addr, opt)) { match = 1; break; } } rcu_read_unlock(); return match; } int sctp_bind_addrs_check(struct sctp_sock *sp, struct sctp_sock *sp2, int cnt2) { struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr; struct sctp_bind_addr *bp = &sp->ep->base.bind_addr; struct sctp_sockaddr_entry *laddr, *laddr2; bool exist = false; int cnt = 0; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { list_for_each_entry_rcu(laddr2, &bp2->address_list, list) { if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) && laddr->valid && laddr2->valid) { exist = true; goto next; } } cnt = 0; break; next: cnt++; } rcu_read_unlock(); return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1); } /* Does the address 'addr' conflict with any addresses in * the bp. */ int sctp_bind_addr_conflict(struct sctp_bind_addr *bp, const union sctp_addr *addr, struct sctp_sock *bp_sp, struct sctp_sock *addr_sp) { struct sctp_sockaddr_entry *laddr; int conflict = 0; struct sctp_sock *sp; /* Pick the IPv6 socket as the basis of comparison * since it's usually a superset of the IPv4. * If there is no IPv6 socket, then default to bind_addr. */ if (sctp_opt2sk(bp_sp)->sk_family == AF_INET6) sp = bp_sp; else if (sctp_opt2sk(addr_sp)->sk_family == AF_INET6) sp = addr_sp; else sp = bp_sp; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; conflict = sp->pf->cmp_addr(&laddr->a, addr, sp); if (conflict) break; } rcu_read_unlock(); return conflict; } /* Get the state of the entry in the bind_addr_list */ int sctp_bind_addr_state(const struct sctp_bind_addr *bp, const union sctp_addr *addr) { struct sctp_sockaddr_entry *laddr; struct sctp_af *af; af = sctp_get_af_specific(addr->sa.sa_family); if (unlikely(!af)) return -1; list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; if (af->cmp_addr(&laddr->a, addr)) return laddr->state; } return -1; } /* Find the first address in the bind address list that is not present in * the addrs packed array. */ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, const union sctp_addr *addrs, int addrcnt, struct sctp_sock *opt) { struct sctp_sockaddr_entry *laddr; union sctp_addr *addr; void *addr_buf; struct sctp_af *af; int i; /* This is only called sctp_send_asconf_del_ip() and we hold * the socket lock in that code patch, so that address list * can't change. */ list_for_each_entry(laddr, &bp->address_list, list) { addr_buf = (union sctp_addr *)addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); if (!af) break; if (opt->pf->cmp_addr(&laddr->a, addr, opt)) break; addr_buf += af->sockaddr_len; } if (i == addrcnt) return &laddr->a; } return NULL; } /* Copy out addresses from the global local address list. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, union sctp_addr *addr, enum sctp_scope scope, gfp_t gfp, int flags) { int error = 0; if (sctp_is_any(NULL, addr)) { error = sctp_copy_local_addr_list(net, dest, scope, gfp, flags); } else if (sctp_in_scope(net, addr, scope)) { /* Now that the address is in scope, check to see if * the address type is supported by local sock as * well as the remote peer. */ if ((((AF_INET == addr->sa.sa_family) && (flags & SCTP_ADDR4_ALLOWED) && (flags & SCTP_ADDR4_PEERSUPP))) || (((AF_INET6 == addr->sa.sa_family) && (flags & SCTP_ADDR6_ALLOWED) && (flags & SCTP_ADDR6_PEERSUPP)))) error = sctp_add_bind_addr(dest, addr, sizeof(*addr), SCTP_ADDR_SRC, gfp); } return error; } /* Is this a wildcard address? */ int sctp_is_any(struct sock *sk, const union sctp_addr *addr) { unsigned short fam = 0; struct sctp_af *af; /* Try to get the right address family */ if (addr->sa.sa_family != AF_UNSPEC) fam = addr->sa.sa_family; else if (sk) fam = sk->sk_family; af = sctp_get_af_specific(fam); if (!af) return 0; return af->is_any(addr); } /* Is 'addr' valid for 'scope'? */ int sctp_in_scope(struct net *net, const union sctp_addr *addr, enum sctp_scope scope) { enum sctp_scope addr_scope = sctp_scope(addr); /* The unusable SCTP addresses will not be considered with * any defined scopes. */ if (SCTP_SCOPE_UNUSABLE == addr_scope) return 0; /* * For INIT and INIT-ACK address list, let L be the level of * requested destination address, sender and receiver * SHOULD include all of its addresses with level greater * than or equal to L. * * Address scoping can be selectively controlled via sysctl * option */ switch (net->sctp.scope_policy) { case SCTP_SCOPE_POLICY_DISABLE: return 1; case SCTP_SCOPE_POLICY_ENABLE: if (addr_scope <= scope) return 1; break; case SCTP_SCOPE_POLICY_PRIVATE: if (addr_scope <= scope || SCTP_SCOPE_PRIVATE == addr_scope) return 1; break; case SCTP_SCOPE_POLICY_LINK: if (addr_scope <= scope || SCTP_SCOPE_LINK == addr_scope) return 1; break; default: break; } return 0; } int sctp_is_ep_boundall(struct sock *sk) { struct sctp_bind_addr *bp; struct sctp_sockaddr_entry *addr; bp = &sctp_sk(sk)->ep->base.bind_addr; if (sctp_list_single_entry(&bp->address_list)) { addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(sk, &addr->a)) return 1; } return 0; } /******************************************************************** * 3rd Level Abstractions ********************************************************************/ /* What is the scope of 'addr'? */ enum sctp_scope sctp_scope(const union sctp_addr *addr) { struct sctp_af *af; af = sctp_get_af_specific(addr->sa.sa_family); if (!af) return SCTP_SCOPE_UNUSABLE; return af->scope((union sctp_addr *)addr); }
55 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 // SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/revoke.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 * * Copyright 2000 Red Hat corp --- All Rights Reserved * * Journal revoke routines for the generic filesystem journaling code; * part of the ext2fs journaling system. * * Revoke is the mechanism used to prevent old log records for deleted * metadata from being replayed on top of newer data using the same * blocks. The revoke mechanism is used in two separate places: * * + Commit: during commit we write the entire list of the current * transaction's revoked blocks to the journal * * + Recovery: during recovery we record the transaction ID of all * revoked blocks. If there are multiple revoke records in the log * for a single block, only the last one counts, and if there is a log * entry for a block beyond the last revoke, then that log entry still * gets replayed. * * We can get interactions between revokes and new log data within a * single transaction: * * Block is revoked and then journaled: * The desired end result is the journaling of the new block, so we * cancel the revoke before the transaction commits. * * Block is journaled and then revoked: * The revoke must take precedence over the write of the block, so we * need either to cancel the journal entry or to write the revoke * later in the log than the log block. In this case, we choose the * latter: journaling a block cancels any revoke record for that block * in the current transaction, so any revoke for that block in the * transaction must have happened after the block was journaled and so * the revoke must take precedence. * * Block is revoked and then written as data: * The data write is allowed to succeed, but the revoke is _not_ * cancelled. We still need to prevent old log records from * overwriting the new data. We don't even need to clear the revoke * bit here. * * We cache revoke status of a buffer in the current transaction in b_states * bits. As the name says, revokevalid flag indicates that the cached revoke * status of a buffer is valid and we can rely on the cached status. * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up * RevokeValid set, Revoked clear: * buffer has not been revoked, and cancel_revoke * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. * * Locking rules: * We keep two hash tables of revoke records. One hashtable belongs to the * running transaction (is pointed to by journal->j_revoke), the other one * belongs to the committing transaction. Accesses to the second hash table * happen only from the kjournald and no other thread touches this table. Also * journal_switch_revoke_table() which switches which hashtable belongs to the * running and which to the committing transaction is called only from * kjournald. Therefore we need no locks when accessing the hashtable belonging * to the committing transaction. * * All users operating on the hash table belonging to the running transaction * have a handle to the transaction. Therefore they are safe from kjournald * switching hash tables under them. For operations on the lists of entries in * the hash table j_revoke_lock is used. * * Finally, also replay code uses the hash tables but at this moment no one else * can touch them (filesystem isn't mounted yet) and hence no locking is * needed. */ #ifndef __KERNEL__ #include "jfs_user.h" #else #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/log2.h> #include <linux/hash.h> #endif static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; /* Each revoke record represents one single revoked block. During journal replay, this involves recording the transaction ID of the last transaction to revoke this block. */ struct jbd2_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ unsigned long long blocknr; }; /* The revoke table is just a simple hash table of revoke records. */ struct jbd2_revoke_table_s { /* It is conceivable that we might want a larger hash table * for recovery. Must be a power of two. */ int hash_size; int hash_shift; struct list_head *hash_table; }; #ifdef __KERNEL__ static void write_one_revoke_record(transaction_t *, struct list_head *, struct buffer_head **, int *, struct jbd2_revoke_record_s *); static void flush_descriptor(journal_t *, struct buffer_head *, int); #endif /* Utility functions to maintain the revoke table */ static inline int hash(journal_t *journal, unsigned long long block) { return hash_64(block, journal->j_revoke->hash_shift); } static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, tid_t seq) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; gfp_t gfp_mask = GFP_NOFS; if (journal_oom_retry) gfp_mask |= __GFP_NOFAIL; record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; } /* Find a revoke record in the journal's hash table. */ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, unsigned long long blocknr) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); record = (struct jbd2_revoke_record_s *) hash_list->next; while (&(record->hash) != hash_list) { if (record->blocknr == blocknr) { spin_unlock(&journal->j_revoke_lock); return record; } record = (struct jbd2_revoke_record_s *) record->hash.next; } spin_unlock(&journal->j_revoke_lock); return NULL; } void jbd2_journal_destroy_revoke_record_cache(void) { kmem_cache_destroy(jbd2_revoke_record_cache); jbd2_revoke_record_cache = NULL; } void jbd2_journal_destroy_revoke_table_cache(void) { kmem_cache_destroy(jbd2_revoke_table_cache); jbd2_revoke_table_cache = NULL; } int __init jbd2_journal_init_revoke_record_cache(void) { J_ASSERT(!jbd2_revoke_record_cache); jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); if (!jbd2_revoke_record_cache) { pr_emerg("JBD2: failed to create revoke_record cache\n"); return -ENOMEM; } return 0; } int __init jbd2_journal_init_revoke_table_cache(void) { J_ASSERT(!jbd2_revoke_table_cache); jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, SLAB_TEMPORARY); if (!jbd2_revoke_table_cache) { pr_emerg("JBD2: failed to create revoke_table cache\n"); return -ENOMEM; } return 0; } static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; struct jbd2_revoke_table_s *table; table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); if (!table) goto out; while((tmp >>= 1UL) != 0UL) shift++; table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; goto out; } for (tmp = 0; tmp < hash_size; tmp++) INIT_LIST_HEAD(&table->hash_table[tmp]); out: return table; } static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; for (i = 0; i < table->hash_size; i++) { hash_list = &table->hash_table[i]; J_ASSERT(list_empty(hash_list)); } kfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } /* Initialise the revoke table for a given journal to a given size. */ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) { J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[0]) goto fail0; journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[1]) goto fail1; journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; fail1: jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); journal->j_revoke_table[0] = NULL; fail0: return -ENOMEM; } /* Destroy a journal's revoke table. The table must already be empty! */ void jbd2_journal_destroy_revoke(journal_t *journal) { journal->j_revoke = NULL; if (journal->j_revoke_table[0]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); if (journal->j_revoke_table[1]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); } #ifdef __KERNEL__ /* * jbd2_journal_revoke: revoke a given buffer_head from the journal. This * prevents the block from being replayed during recovery if we take a * crash after this current transaction commits. Any subsequent * metadata writes of the buffer in this transaction cancel the * revoke. * * Note that this call may block --- it is up to the caller to make * sure that there are no further calls to journal_write_metadata * before the revoke is complete. In ext3, this implies calling the * revoke before clearing the block bitmap when we are deleting * metadata. * * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a * parameter, but does _not_ forget the buffer_head if the bh was only * found implicitly. * * bh_in may not be a journalled buffer - it may have come off * the hash tables without an attached journal_head. * * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count * by one. */ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; journal_t *journal; struct block_device *bdev; int err; might_sleep(); if (bh_in) BUFFER_TRACE(bh_in, "enter"); journal = handle->h_transaction->t_journal; if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){ J_ASSERT (!"Cannot set revoke feature!"); return -EINVAL; } bdev = journal->j_fs_dev; bh = bh_in; if (!bh) { bh = __find_get_block(bdev, blocknr, journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } #ifdef JBD2_EXPENSIVE_CHECKING else { struct buffer_head *bh2; /* If there is a different buffer_head lying around in * memory anywhere... */ bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if (bh2 != bh && buffer_revokevalid(bh2)) /* ...then it better be revoked too, * since it's illegal to create a revoke * record against a buffer_head which is * not marked revoked --- that would * risk missing a subsequent revoke * cancel. */ J_ASSERT_BH(bh2, buffer_revoked(bh2)); put_bh(bh2); } } #endif if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) { if (!bh_in) brelse(bh); return -EIO; } /* We really ought not ever to revoke twice in a row without first having the revoke cancelled: it's illegal to free a block twice without allocating it in between! */ if (bh) { if (!J_EXPECT_BH(bh, !buffer_revoked(bh), "inconsistent data on disk")) { if (!bh_in) brelse(bh); return -EIO; } set_buffer_revoked(bh); set_buffer_revokevalid(bh); if (bh_in) { BUFFER_TRACE(bh_in, "call jbd2_journal_forget"); jbd2_journal_forget(handle, bh_in); } else { BUFFER_TRACE(bh, "call brelse"); __brelse(bh); } } handle->h_revoke_credits--; jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); return err; } /* * Cancel an outstanding revoke. For use only internally by the * journaling code (called from jbd2_journal_get_write_access). * * We trust buffer_revoked() on the buffer if the buffer is already * being journaled: if there is no revoke pending on the buffer, then we * don't do anything here. * * This would break if it were possible for a buffer to be revoked and * discarded, and then reallocated within the same transaction. In such * a case we would have lost the revoked bit, but when we arrived here * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. */ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { struct jbd2_revoke_record_s *record; journal_t *journal = handle->h_transaction->t_journal; int need_cancel; int did_revoke = 0; /* akpm: debug */ struct buffer_head *bh = jh2bh(jh); jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); /* Is the existing Revoke bit valid? If so, we trust it, and * only perform the full cancel if the revoke bit is set. If * not, we can't trust the revoke bit, and we need to do the * full search for a revoke record. */ if (test_set_buffer_revokevalid(bh)) { need_cancel = test_clear_buffer_revoked(bh); } else { need_cancel = 1; clear_buffer_revoked(bh); } if (need_cancel) { record = find_revoke_record(journal, bh->b_blocknr); if (record) { jbd2_debug(4, "cancelled existing revoke on " "blocknr %llu\n", (unsigned long long)bh->b_blocknr); spin_lock(&journal->j_revoke_lock); list_del(&record->hash); spin_unlock(&journal->j_revoke_lock); kmem_cache_free(jbd2_revoke_record_cache, record); did_revoke = 1; } } #ifdef JBD2_EXPENSIVE_CHECKING /* There better not be one left behind by now! */ record = find_revoke_record(journal, bh->b_blocknr); J_ASSERT_JH(jh, record == NULL); #endif /* Finally, have we just cleared revoke on an unhashed * buffer_head? If so, we'd better make sure we clear the * revoked status on any hashed alias too, otherwise the revoke * state machine will get very upset later on. */ if (need_cancel) { struct buffer_head *bh2; bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); if (bh2) { if (bh2 != bh) clear_buffer_revoked(bh2); __brelse(bh2); } } return did_revoke; } /* * journal_clear_revoked_flag clears revoked flag of buffers in * revoke table to reflect there is no revoked buffers in the next * transaction which is going to be started. */ void jbd2_clear_buffer_revoked_flags(journal_t *journal) { struct jbd2_revoke_table_s *revoke = journal->j_revoke; int i = 0; for (i = 0; i < revoke->hash_size; i++) { struct list_head *hash_list; struct list_head *list_entry; hash_list = &revoke->hash_table[i]; list_for_each(list_entry, hash_list) { struct jbd2_revoke_record_s *record; struct buffer_head *bh; record = (struct jbd2_revoke_record_s *)list_entry; bh = __find_get_block(journal->j_fs_dev, record->blocknr, journal->j_blocksize); if (bh) { clear_buffer_revoked(bh); __brelse(bh); } } } } /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz */ void jbd2_journal_switch_revoke_table(journal_t *journal) { int i; if (journal->j_revoke == journal->j_revoke_table[0]) journal->j_revoke = journal->j_revoke_table[1]; else journal->j_revoke = journal->j_revoke_table[0]; for (i = 0; i < journal->j_revoke->hash_size; i++) INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); } /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. */ void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs) { journal_t *journal = transaction->t_journal; struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; struct list_head *hash_list; int i, offset, count; descriptor = NULL; offset = 0; count = 0; /* select revoke table for committing transaction */ revoke = journal->j_revoke == journal->j_revoke_table[0] ? journal->j_revoke_table[1] : journal->j_revoke_table[0]; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; write_one_revoke_record(transaction, log_bufs, &descriptor, &offset, record); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) flush_descriptor(journal, descriptor, offset); jbd2_debug(1, "Wrote %d revoke records\n", count); } /* * Write out one revoke record. We need to create a new descriptor * block if the old one is full or if we have not already created one. */ static void write_one_revoke_record(transaction_t *transaction, struct list_head *log_bufs, struct buffer_head **descriptorp, int *offsetp, struct jbd2_revoke_record_s *record) { journal_t *journal = transaction->t_journal; int csum_size = 0; struct buffer_head *descriptor; int sz, offset; /* If we are already aborting, this all becomes a noop. We still need to go round the loop in jbd2_journal_write_revoke_records in order to free all of the revoke records: only the IO to the journal is omitted. */ if (is_journal_aborted(journal)) return; descriptor = *descriptorp; offset = *offsetp; /* Do we need to leave space at the end for a checksum? */ if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_block_tail); if (jbd2_has_feature_64bit(journal)) sz = 8; else sz = 4; /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset + sz > journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset); descriptor = NULL; } } if (!descriptor) { descriptor = jbd2_journal_get_descriptor_buffer(transaction, JBD2_REVOKE_BLOCK); if (!descriptor) return; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(descriptor, "file in log_bufs"); jbd2_file_log_bh(log_bufs, descriptor); offset = sizeof(jbd2_journal_revoke_header_t); *descriptorp = descriptor; } if (jbd2_has_feature_64bit(journal)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); else * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); offset += sz; *offsetp = offset; } /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to * be waited for during commit, so it has to go onto the appropriate * journal buffer list. */ static void flush_descriptor(journal_t *journal, struct buffer_head *descriptor, int offset) { jbd2_journal_revoke_header_t *header; if (is_journal_aborted(journal)) return; header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); jbd2_descriptor_block_csum_set(journal, descriptor); set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); write_dirty_buffer(descriptor, REQ_SYNC); } #endif /* * Revoke support for recovery. * * Recovery needs to be able to: * * record all revoke records, including the tid of the latest instance * of each revoke in the journal * * check whether a given block in a given transaction should be replayed * (ie. has not been revoked by a revoke record in that or a subsequent * transaction) * * empty the revoke table after recovery. */ /* * First, setting revoke records. We create a new revoke record for * every block ever revoked in the log as we scan it for recovery, and * we update the existing records if we find multiple revokes for a * single block. */ int jbd2_journal_set_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (record) { /* If we have multiple occurrences, only record the * latest sequence number in the hashed record */ if (tid_gt(sequence, record->sequence)) record->sequence = sequence; return 0; } return insert_revoke_hash(journal, blocknr, sequence); } /* * Test revoke records. For a given block referenced in the log, has * that block been revoked? A revoke record with a given transaction * sequence number revokes all blocks in that transaction and earlier * ones, but later transactions still need replayed. */ int jbd2_journal_test_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (!record) return 0; if (tid_gt(sequence, record->sequence)) return 0; return 1; } /* * Finally, once recovery is over, we need to clear the revoke table so * that it can be reused by the running filesystem. */ void jbd2_journal_clear_revoke(journal_t *journal) { int i; struct list_head *hash_list; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; revoke = journal->j_revoke; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s*) hash_list->next; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } }
127 70 105 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif #ifdef CONFIG_FANOTIFY UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif UCOUNT_COUNTS, }; enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, UCOUNT_RLIMIT_COUNTS, }; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc; #endif struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE(). */ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; long rlimit_max[UCOUNT_RLIMIT_COUNTS]; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc *binfmt_misc; #endif } __randomize_layout; struct ucounts { struct hlist_node node; struct user_namespace *ns; kuid_t uid; atomic_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) { return READ_ONCE(ns->rlimit_max[type]); } static inline void set_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type, unsigned long max) { ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_H */
56 4 56 7 56 4 56 1 56 53 7 53 53 53 53 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright (C) 2016 - 2020 Christoph Hellwig */ #include <linux/init.h> #include <linux/mm.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/namei.h> #include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <linux/suspend.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/module.h> #include "blk.h" static inline struct inode *bdev_file_inode(struct file *file) { return file->f_mapping->host; } static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; /* avoid the need for a I/O completion work item */ if (iocb_is_dsync(iocb)) opf |= REQ_FUA; return opf; } static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, struct iov_iter *iter) { return pos & (bdev_logical_block_size(bdev) - 1) || !bdev_iter_is_aligned(bdev, iter); } #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; if (blkdev_dio_unaligned(bdev, pos, iter)) return -EINVAL; if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); if (!vecs) return -ENOMEM; } if (iov_iter_rw(iter) == READ) { bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ); if (user_backed_iter(iter)) should_dirty = true; } else { bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; if (iov_iter_rw(iter) == WRITE) task_io_account_write(ret); if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; submit_bio_wait(&bio); bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) ret = blk_status_to_errno(bio.bi_status); out: if (vecs != inline_vecs) kfree(vecs); bio_uninit(&bio); return ret; } enum { DIO_SHOULD_DIRTY = 1, DIO_IS_SYNC = 2, }; struct blkdev_dio { union { struct kiocb *iocb; struct task_struct *waiter; }; size_t size; atomic_t ref; unsigned int flags; struct bio bio ____cacheline_aligned_in_smp; }; static struct bio_set blkdev_dio_pool; static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; if (atomic_dec_and_test(&dio->ref)) { if (!(dio->flags & DIO_IS_SYNC)) { struct kiocb *iocb = dio->iocb; ssize_t ret; WRITE_ONCE(iocb->private, NULL); if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret); bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); blk_wake_io_task(waiter); } } if (should_dirty) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; bool is_read = (iov_iter_rw(iter) == READ), is_sync; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); loff_t pos = iocb->ki_pos; int ret = 0; if (blkdev_dio_unaligned(bdev, pos, iter)) return -EINVAL; if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); atomic_set(&dio->ref, 1); /* * Grab an extra reference to ensure the dio structure which is embedded * into the first bio stays around. */ bio_get(bio); is_sync = is_sync_kiocb(iocb); if (is_sync) { dio->flags = DIO_IS_SYNC; dio->waiter = current; } else { dio->flags = 0; dio->iocb = iocb; } dio->size = 0; if (is_read && user_backed_iter(iter)) dio->flags |= DIO_SHOULD_DIRTY; blk_start_plug(&plug); for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } if (iocb->ki_flags & IOCB_NOWAIT) { /* * This is nonblocking IO, and we need to allocate * another bio if we have data left to map. As we * cannot guarantee that one of the sub bios will not * fail getting issued FOR NOWAIT and as error results * are coalesced across all of them, be safe and ask for * a retry of this from blocking context. */ if (unlikely(iov_iter_count(iter))) { bio_release_pages(bio, false); bio_clear_flag(bio, BIO_REFFED); bio_put(bio); blk_finish_plug(&plug); return -EAGAIN; } bio->bi_opf |= REQ_NOWAIT; } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) bio_set_pages_dirty(bio); } else { task_io_account_write(bio->bi_iter.bi_size); } dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { submit_bio(bio); break; } atomic_inc(&dio->ref); submit_bio(bio); bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL); } blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->waiter)) break; blk_io_schedule(); } __set_current_state(TASK_RUNNING); if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; bio_put(&dio->bio); return ret; } static void blkdev_bio_end_io_async(struct bio *bio) { struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio); struct kiocb *iocb = dio->iocb; ssize_t ret; WRITE_ONCE(iocb->private, NULL); if (likely(!bio->bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(bio->bi_status); } iocb->ki_complete(iocb, ret); if (dio->flags & DIO_SHOULD_DIRTY) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; struct bio *bio; loff_t pos = iocb->ki_pos; int ret = 0; if (blkdev_dio_unaligned(bdev, pos, iter)) return -EINVAL; if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->flags = 0; dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; if (iov_iter_is_bvec(iter)) { /* * Users don't rely on the iterator being in any particular * state for async I/O returning -EIOCBQUEUED, hence we can * avoid expensive iov_iter_advance(). Bypass * bio_iov_iter_get_pages() and set the bvec directly. */ bio_iov_bvec_set(bio, iter); } else { ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { bio_put(bio); return ret; } } dio->size = bio->bi_iter.bi_size; if (is_read) { if (user_backed_iter(iter)) { dio->flags |= DIO_SHOULD_DIRTY; bio_set_pages_dirty(bio); } } else { task_io_account_write(bio->bi_iter.bi_size); } if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) { bio->bi_opf |= REQ_POLLED; submit_bio(bio); WRITE_ONCE(iocb->private, bio); } else { submit_bio(bio); } return -EIOCBQUEUED; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS)) { if (is_sync_kiocb(iocb)) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); return __blkdev_direct_IO_async(iocb, iter, nr_pages); } return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct block_device *bdev = I_BDEV(inode); loff_t isize = i_size_read(inode); iomap->bdev = bdev; iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); if (iomap->offset >= isize) return -EIO; iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ return 0; } static const struct iomap_ops blkdev_iomap_ops = { .iomap_begin = blkdev_iomap_begin, }; #ifdef CONFIG_BUFFER_HEAD static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); } static int blkdev_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, blkdev_get_block); } static void blkdev_readahead(struct readahead_control *rac) { mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { return block_write_begin(mapping, pos, len, pagep, blkdev_get_block); } static int blkdev_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { int ret; ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); unlock_page(page); put_page(page); return ret; } const struct address_space_operations def_blk_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; #else /* CONFIG_BUFFER_HEAD */ static int blkdev_read_folio(struct file *file, struct folio *folio) { return iomap_read_folio(folio, &blkdev_iomap_ops); } static void blkdev_readahead(struct readahead_control *rac) { iomap_readahead(rac, &blkdev_iomap_ops); } static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset) { loff_t isize = i_size_read(inode); if (WARN_ON_ONCE(offset >= isize)) return -EIO; if (offset >= wpc->iomap.offset && offset < wpc->iomap.offset + wpc->iomap.length) return 0; return blkdev_iomap_begin(inode, offset, isize - offset, IOMAP_WRITE, &wpc->iomap, NULL); } static const struct iomap_writeback_ops blkdev_writeback_ops = { .map_blocks = blkdev_map_blocks, }; static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct iomap_writepage_ctx wpc = { }; return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); } const struct address_space_operations def_blk_aops = { .dirty_folio = filemap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ /* * for a block special file file_inode(file)->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = bdev_file_inode(file); loff_t retval; inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); inode_unlock(bd_inode); return retval; } static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); int error; error = file_write_and_wait_range(filp, start, end); if (error) return error; /* * There is no need to serialise calls to blkdev_issue_flush with * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ error = blkdev_issue_flush(bdev); if (error == -EOPNOTSUPP) error = 0; return error; } /** * file_to_blk_mode - get block open flags from file flags * @file: file whose open flags should be converted * * Look at file open flags and generate corresponding block open flags from * them. The function works both for file just being open (e.g. during ->open * callback) and for file that is already open. This is actually non-trivial * (see comment in the function). */ blk_mode_t file_to_blk_mode(struct file *file) { blk_mode_t mode = 0; struct bdev_handle *handle = file->private_data; if (file->f_mode & FMODE_READ) mode |= BLK_OPEN_READ; if (file->f_mode & FMODE_WRITE) mode |= BLK_OPEN_WRITE; /* * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to * determine whether the open was exclusive for already open files. */ if (handle) mode |= handle->mode & BLK_OPEN_EXCL; else if (file->f_flags & O_EXCL) mode |= BLK_OPEN_EXCL; if (file->f_flags & O_NDELAY) mode |= BLK_OPEN_NDELAY; /* * If all bits in O_ACCMODE set (aka O_RDWR | O_WRONLY), the floppy * driver has historically allowed ioctls as if the file was opened for * writing, but does not allow and actual reads or writes. */ if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY)) mode |= BLK_OPEN_WRITE_IOCTL; return mode; } static int blkdev_open(struct inode *inode, struct file *filp) { struct bdev_handle *handle; blk_mode_t mode; /* * Preserve backwards compatibility and allow large file access * even if userspace doesn't ask for it explicitly. Some mkfs * binary needs it. We might want to drop this workaround * during an unstable branch. */ filp->f_flags |= O_LARGEFILE; filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; mode = file_to_blk_mode(filp); handle = bdev_open_by_dev(inode->i_rdev, mode, mode & BLK_OPEN_EXCL ? filp : NULL, NULL); if (IS_ERR(handle)) return PTR_ERR(handle); if (bdev_nowait(handle->bdev)) filp->f_mode |= FMODE_NOWAIT; filp->f_mapping = handle->bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); filp->private_data = handle; return 0; } static int blkdev_release(struct inode *inode, struct file *filp) { bdev_release(filp->private_data); return 0; } static ssize_t blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) { size_t count = iov_iter_count(from); ssize_t written; written = kiocb_invalidate_pages(iocb, count); if (written) { if (written == -EBUSY) return 0; return written; } written = blkdev_direct_IO(iocb, from); if (written > 0) { kiocb_invalidate_post_direct_write(iocb, count); iocb->ki_pos += written; count -= written; } if (written != -EIOCBQUEUED) iov_iter_revert(from, count - iov_iter_count(from)); return written; } static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) { return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); } /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. * * Does not take i_mutex for the write and thus is not for general purpose * use. */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct block_device *bdev = I_BDEV(file->f_mapping->host); struct inode *bd_inode = bdev->bd_inode; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; ssize_t ret; if (bdev_read_only(bdev)) return -EPERM; if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) return -ETXTBSY; if (!iov_iter_count(from)) return 0; if (iocb->ki_pos >= size) return -ENOSPC; if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; size -= iocb->ki_pos; if (iov_iter_count(from) > size) { shorted = iov_iter_count(from) - size; iov_iter_truncate(from, size); } ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = blkdev_direct_write(iocb, from); if (ret >= 0 && iov_iter_count(from)) ret = direct_write_fallback(iocb, from, ret, blkdev_buffered_write(iocb, from)); } else { ret = blkdev_buffered_write(iocb, from); } if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); return ret; } static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret = 0; size_t count; if (unlikely(pos + iov_iter_count(to) > size)) { if (pos >= size) return 0; size -= pos; shorted = iov_iter_count(to) - size; iov_iter_truncate(to, size); } count = iov_iter_count(to); if (!count) goto reexpand; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { ret = kiocb_write_and_wait(iocb, count); if (ret < 0) goto reexpand; file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); if (ret >= 0) { iocb->ki_pos += ret; count -= ret; } iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) goto reexpand; } ret = filemap_read(iocb, to, ret); reexpand: if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct inode *inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(inode); loff_t end = start + len - 1; loff_t isize; int error; /* Fail if we don't recognize the flags. */ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; /* Don't go off the end of the device. */ isize = bdev_nr_bytes(bdev); if (start >= isize) return -EINVAL; if (end >= isize) { if (mode & FALLOC_FL_KEEP_SIZE) { len = isize - start; end = start + len - 1; } else return -EINVAL; } /* * Don't allow IO that isn't aligned to logical block size. */ if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; filemap_invalidate_lock(inode->i_mapping); /* * Invalidate the page cache, including dirty pages, for valid * de-allocate mode calls to fallocate(). */ switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); if (error) goto fail; error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); if (error) goto fail; error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); if (error) goto fail; error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL); break; default: error = -EOPNOTSUPP; } fail: filemap_invalidate_unlock(inode->i_mapping); return error; } static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *bd_inode = bdev_file_inode(file); if (bdev_read_only(I_BDEV(bd_inode))) return generic_file_readonly_mmap(file, vma); return generic_file_mmap(file, vma); } const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_release, .llseek = blkdev_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, .iopoll = iocb_bio_iopoll, .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = blkdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, }; static __init int blkdev_init(void) { return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); } module_init(blkdev_init);
5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 // SPDX-License-Identifier: GPL-2.0-or-later /* md.c : Multiple Devices driver for Linux Copyright (C) 1998, 1999, 2000 Ingo Molnar completely rewritten, based on the MD driver code from Marc Zyngier Changes: - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> - kerneld support by Boris Tobotras <boris@xtalk.msk.su> - kmod support by: Cyrus Durgin - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> - lots of fixes and improvements to the RAID1/RAID5 and generic RAID code (such as request based resynchronization): Neil Brown <neilb@cse.unsw.edu.au>. - persistent bitmap code Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. Errors, Warnings, etc. Please use: pr_crit() for error conditions that risk data loss pr_err() for error conditions that are unexpected, like an IO error or internal inconsistency pr_warn() for error conditions that could have been predicated, like adding a device to an array when it has incompatible metadata pr_info() for every interesting, very rare events, like an array starting or stopping, or resync starting or stopping pr_debug() for everything else. */ #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/kthread.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/badblocks.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/ctype.h> #include <linux/string.h> #include <linux/hdreg.h> #include <linux/proc_fs.h> #include <linux/random.h> #include <linux/major.h> #include <linux/module.h> #include <linux/reboot.h> #include <linux/file.h> #include <linux/compat.h> #include <linux/delay.h> #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> #include <linux/raid/detect.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> #include <linux/part_stat.h> #include <trace/events/block.h> #include "md.h" #include "md-bitmap.h" #include "md-cluster.h" /* pers_list is a list of registered personalities protected by pers_lock. */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); static const struct kobj_type md_ktype; struct md_cluster_operations *md_cluster_ops; EXPORT_SYMBOL(md_cluster_ops); static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void md_wakeup_thread_directly(struct md_thread __rcu *thread); enum md_ro_state { MD_RDWR, MD_RDONLY, MD_AUTO_READ, MD_MAX_STATE }; static bool md_is_rdwr(struct mddev *mddev) { return (mddev->ro == MD_RDWR); } /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error * count by 2 for every hour elapsed between read errors. */ #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 /* Default safemode delay: 200 msec */ #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. * Increase it if you want to have more _guaranteed_ speed. Note that * the RAID driver will use the maximum available bandwidth if the IO * subsystem is idle. There is also an 'absolute maximum' reconstruction * speed limit - in case reconstruction slows down your system despite * idle IO detection. * * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. * or /sys/block/mdX/md/sync_speed_{min,max} */ static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; static inline int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } static inline int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; } static void rdev_uninit_serial(struct md_rdev *rdev) { if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) return; kvfree(rdev->serial); rdev->serial = NULL; } static void rdevs_uninit_serial(struct mddev *mddev) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) rdev_uninit_serial(rdev); } static int rdev_init_serial(struct md_rdev *rdev) { /* serial_nums equals with BARRIER_BUCKETS_NR */ int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); struct serial_in_rdev *serial = NULL; if (test_bit(CollisionCheck, &rdev->flags)) return 0; serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, GFP_KERNEL); if (!serial) return -ENOMEM; for (i = 0; i < serial_nums; i++) { struct serial_in_rdev *serial_tmp = &serial[i]; spin_lock_init(&serial_tmp->serial_lock); serial_tmp->serial_rb = RB_ROOT_CACHED; init_waitqueue_head(&serial_tmp->serial_io_wait); } rdev->serial = serial; set_bit(CollisionCheck, &rdev->flags); return 0; } static int rdevs_init_serial(struct mddev *mddev) { struct md_rdev *rdev; int ret = 0; rdev_for_each(rdev, mddev) { ret = rdev_init_serial(rdev); if (ret) break; } /* Free all resources if pool is not existed */ if (ret && !mddev->serial_info_pool) rdevs_uninit_serial(mddev); return ret; } /* * rdev needs to enable serial stuffs if it meets the conditions: * 1. it is multi-queue device flaged with writemostly. * 2. the write-behind mode is enabled. */ static int rdev_need_serial(struct md_rdev *rdev) { return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && test_bit(WriteMostly, &rdev->flags)); } /* * Init resource for rdev(s), then create serial_info_pool if: * 1. rdev is the first device which return true from rdev_enable_serial. * 2. rdev is NULL, means we want to enable serialization for all rdevs. */ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { int ret = 0; if (rdev && !rdev_need_serial(rdev) && !test_bit(CollisionCheck, &rdev->flags)) return; if (!rdev) ret = rdevs_init_serial(mddev); else ret = rdev_init_serial(rdev); if (ret) return; if (mddev->serial_info_pool == NULL) { /* * already in memalloc noio context by * mddev_suspend() */ mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); if (!mddev->serial_info_pool) { rdevs_uninit_serial(mddev); pr_err("can't alloc memory pool for serialization\n"); } } } /* * Free resource from rdev(s), and destroy serial_info_pool under conditions: * 1. rdev is the last device flaged with CollisionCheck. * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; if (mddev->serial_info_pool) { struct md_rdev *temp; int num = 0; /* used to track if other rdevs need the pool */ rdev_for_each(temp, mddev) { if (!rdev) { if (!mddev->serialize_policy || !rdev_need_serial(temp)) rdev_uninit_serial(temp); else num++; } else if (temp != rdev && test_bit(CollisionCheck, &temp->flags)) num++; } if (rdev) rdev_uninit_serial(rdev); if (num) pr_info("The mempool could be used by other devices\n"); else { mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; } } } static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, .proc_handler = proc_dointvec, }, { .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, .proc_handler = proc_dointvec, }, }; static int start_readonly; /* * The original mechanism for creating an md device is to create * a device node in /dev and to open it. This causes races with device-close. * The preferred method is to write to the "new_array" module parameter. * This can avoid races. * Setting create_on_open to false disables the original mechanism * so all the races disappear. */ static bool create_on_open = true; /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat * can use 'poll' or 'select' to find out when the event * count increases. * * Events are: * start array, stop array, error, add device, remove device, * start build, activate spare */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; void md_new_event(void) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); } EXPORT_SYMBOL_GPL(md_new_event); /* * Enables to iterate over all existing md arrays * all_mddevs_lock protects this list. */ static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); static bool is_md_suspended(struct mddev *mddev) { return percpu_ref_is_dying(&mddev->active_io); } /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. * We hold a refcount over the call to ->make_request. By the time that * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ static bool is_suspended(struct mddev *mddev, struct bio *bio) { if (is_md_suspended(mddev)) return true; if (bio_data_dir(bio) != WRITE) return false; if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) return false; if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) return false; if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) return false; return true; } void md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { bio_wouldblock_error(bio); return; } for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); if (!is_suspended(mddev, bio)) break; schedule(); } finish_wait(&mddev->sb_wait, &__wait); } if (!percpu_ref_tryget_live(&mddev->active_io)) goto check_suspended; if (!mddev->pers->make_request(mddev, bio)) { percpu_ref_put(&mddev->active_io); goto check_suspended; } percpu_ref_put(&mddev->active_io); } EXPORT_SYMBOL(md_handle_request); static void md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return; } if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { bio_io_error(bio); return; } bio = bio_split_to_limits(bio); if (!bio) return; if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; } /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; md_handle_request(mddev, bio); } /* * Make sure no new requests are submitted to the device, and any requests that * have been submitted are completely handled. */ int mddev_suspend(struct mddev *mddev, bool interruptible) { int err = 0; /* * hold reconfig_mutex to wait for normal io will deadlock, because * other context can't update super_block, and normal io can rely on * updating super_block. */ lockdep_assert_not_held(&mddev->reconfig_mutex); if (interruptible) err = mutex_lock_interruptible(&mddev->suspend_mutex); else mutex_lock(&mddev->suspend_mutex); if (err) return err; if (mddev->suspended) { WRITE_ONCE(mddev->suspended, mddev->suspended + 1); mutex_unlock(&mddev->suspend_mutex); return 0; } percpu_ref_kill(&mddev->active_io); if (interruptible) err = wait_event_interruptible(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); else wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); if (err) { percpu_ref_resurrect(&mddev->active_io); mutex_unlock(&mddev->suspend_mutex); return err; } /* * For raid456, io might be waiting for reshape to make progress, * allow new reshape to start while waiting for io to be done to * prevent deadlock. */ WRITE_ONCE(mddev->suspended, mddev->suspended + 1); del_timer_sync(&mddev->safemode_timer); /* restrict memory reclaim I/O during raid array is suspend */ mddev->noio_flag = memalloc_noio_save(); mutex_unlock(&mddev->suspend_mutex); return 0; } EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { lockdep_assert_not_held(&mddev->reconfig_mutex); mutex_lock(&mddev->suspend_mutex); WRITE_ONCE(mddev->suspended, mddev->suspended - 1); if (mddev->suspended) { mutex_unlock(&mddev->suspend_mutex); return; } /* entred the memalloc scope from mddev_suspend() */ memalloc_noio_restore(mddev->noio_flag); percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ mutex_unlock(&mddev->suspend_mutex); } EXPORT_SYMBOL_GPL(mddev_resume); /* * Generic flush handling for md */ static void md_end_flush(struct bio *bio) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; bio_put(bio); rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->flush_pending)) { /* The pre-request flush has finished */ queue_work(md_wq, &mddev->flush_work); } } static void md_submit_flush_data(struct work_struct *ws); static void submit_flushes(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct md_rdev *rdev; mddev->start_flush = ktime_get_boottime(); INIT_WORK(&mddev->flush_work, md_submit_flush_data); atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* Take two references, one is dropped * when request finishes, one after * we reclaim rcu_read_lock */ struct bio *bi; atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); bi = bio_alloc_bioset(rdev->bdev, 0, REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, &mddev->bio_set); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; atomic_inc(&mddev->flush_pending); submit_bio(bi); rcu_read_lock(); rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); if (atomic_dec_and_test(&mddev->flush_pending)) queue_work(md_wq, &mddev->flush_work); } static void md_submit_flush_data(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct bio *bio = mddev->flush_bio; /* * must reset flush_bio before calling into md_handle_request to avoid a * deadlock, because other bios passed md_handle_request suspend check * could wait for this and below md_handle_request could wait for those * bios because of suspend check */ spin_lock_irq(&mddev->lock); mddev->prev_flush_start = mddev->start_flush; mddev->flush_bio = NULL; spin_unlock_irq(&mddev->lock); wake_up(&mddev->sb_wait); if (bio->bi_iter.bi_size == 0) { /* an empty barrier - all done */ bio_endio(bio); } else { bio->bi_opf &= ~REQ_PREFLUSH; md_handle_request(mddev, bio); } } /* * Manages consolidation of flushes and submitting any flushes needed for * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is * being finished in another context. Returns false if the flushing is * complete but still needs the I/O portion of the bio to be processed. */ bool md_flush_request(struct mddev *mddev, struct bio *bio) { ktime_t req_start = ktime_get_boottime(); spin_lock_irq(&mddev->lock); /* flush requests wait until ongoing flush completes, * hence coalescing all the pending requests. */ wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio || ktime_before(req_start, mddev->prev_flush_start), mddev->lock); /* new request after previous flush is completed */ if (ktime_after(req_start, mddev->prev_flush_start)) { WARN_ON(mddev->flush_bio); mddev->flush_bio = bio; bio = NULL; } spin_unlock_irq(&mddev->lock); if (!bio) { INIT_WORK(&mddev->flush_work, submit_flushes); queue_work(md_wq, &mddev->flush_work); } else { /* flush was performed for some other bio while we waited. */ if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; return false; } } return true; } EXPORT_SYMBOL(md_flush_request); static inline struct mddev *mddev_get(struct mddev *mddev) { lockdep_assert_held(&all_mddevs_lock); if (test_bit(MD_DELETED, &mddev->flags)) return NULL; atomic_inc(&mddev->active); return mddev; } static void mddev_delayed_delete(struct work_struct *ws); static void __mddev_put(struct mddev *mddev) { if (mddev->raid_disks || !list_empty(&mddev->disks) || mddev->ctime || mddev->hold_active) return; /* Array is not configured at all, and not held active, so destroy it */ set_bit(MD_DELETED, &mddev->flags); /* * Call queue_work inside the spinlock so that flush_workqueue() after * mddev_find will succeed in waiting for the work to be done. */ queue_work(md_misc_wq, &mddev->del_work); } void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; __mddev_put(mddev); spin_unlock(&all_mddevs_lock); } static void md_safemode_timeout(struct timer_list *t); static void md_start_sync(struct work_struct *ws); static void active_io_release(struct percpu_ref *ref) { struct mddev *mddev = container_of(ref, struct mddev, active_io); wake_up(&mddev->sb_wait); } static void no_op(struct percpu_ref *r) {} int mddev_init(struct mddev *mddev) { if (percpu_ref_init(&mddev->active_io, active_io_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) return -ENOMEM; if (percpu_ref_init(&mddev->writes_pending, no_op, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { percpu_ref_exit(&mddev->active_io); return -ENOMEM; } /* We want to start with the refcount at zero */ percpu_ref_put(&mddev->writes_pending); mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->sync_mutex); mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); INIT_LIST_HEAD(&mddev->deleting); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; mddev->last_sync_action = "none"; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); return 0; } EXPORT_SYMBOL_GPL(mddev_init); void mddev_destroy(struct mddev *mddev) { percpu_ref_exit(&mddev->active_io); percpu_ref_exit(&mddev->writes_pending); } EXPORT_SYMBOL_GPL(mddev_destroy); static struct mddev *mddev_find_locked(dev_t unit) { struct mddev *mddev; list_for_each_entry(mddev, &all_mddevs, all_mddevs) if (mddev->unit == unit) return mddev; return NULL; } /* find an unused unit number */ static dev_t mddev_alloc_unit(void) { static int next_minor = 512; int start = next_minor; bool is_free = 0; dev_t dev = 0; while (!is_free) { dev = MKDEV(MD_MAJOR, next_minor); next_minor++; if (next_minor > MINORMASK) next_minor = 0; if (next_minor == start) return 0; /* Oh dear, all in use. */ is_free = !mddev_find_locked(dev); } return dev; } static struct mddev *mddev_alloc(dev_t unit) { struct mddev *new; int error; if (unit && MAJOR(unit) != MD_MAJOR) unit &= ~((1 << MdpMinorShift) - 1); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); error = mddev_init(new); if (error) goto out_free_new; spin_lock(&all_mddevs_lock); if (unit) { error = -EEXIST; if (mddev_find_locked(unit)) goto out_destroy_new; new->unit = unit; if (MAJOR(unit) == MD_MAJOR) new->md_minor = MINOR(unit); else new->md_minor = MINOR(unit) >> MdpMinorShift; new->hold_active = UNTIL_IOCTL; } else { error = -ENODEV; new->unit = mddev_alloc_unit(); if (!new->unit) goto out_destroy_new; new->md_minor = MINOR(new->unit); new->hold_active = UNTIL_STOP; } list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); return new; out_destroy_new: spin_unlock(&all_mddevs_lock); mddev_destroy(new); out_free_new: kfree(new); return ERR_PTR(error); } static void mddev_free(struct mddev *mddev) { spin_lock(&all_mddevs_lock); list_del(&mddev->all_mddevs); spin_unlock(&all_mddevs_lock); mddev_destroy(mddev); kfree(mddev); } static const struct attribute_group md_redundancy_group; void mddev_unlock(struct mddev *mddev) { struct md_rdev *rdev; struct md_rdev *tmp; LIST_HEAD(delete); if (!list_empty(&mddev->deleting)) list_splice_init(&mddev->deleting, &delete); if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as * an access to the files will try to take reconfig_mutex * while holding the file unremovable, which leads to * a deadlock. * So hold set sysfs_active while the remove in happeing, * and anything else which might set ->to_remove or my * otherwise change the sysfs namespace will fail with * -EBUSY if sysfs_active is still set. * We set sysfs_active under reconfig_mutex and elsewhere * test it under the same mutex to ensure its correct value * is seen. */ const struct attribute_group *to_remove = mddev->to_remove; mddev->to_remove = NULL; mddev->sysfs_active = 1; mutex_unlock(&mddev->reconfig_mutex); if (mddev->kobj.sd) { if (to_remove != &md_redundancy_group) sysfs_remove_group(&mddev->kobj, to_remove); if (mddev->pers == NULL || mddev->pers->sync_request == NULL) { sysfs_remove_group(&mddev->kobj, &md_redundancy_group); if (mddev->sysfs_action) sysfs_put(mddev->sysfs_action); if (mddev->sysfs_completed) sysfs_put(mddev->sysfs_completed); if (mddev->sysfs_degraded) sysfs_put(mddev->sysfs_degraded); mddev->sysfs_action = NULL; mddev->sysfs_completed = NULL; mddev->sysfs_degraded = NULL; } } mddev->sysfs_active = 0; } else mutex_unlock(&mddev->reconfig_mutex); md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); list_for_each_entry_safe(rdev, tmp, &delete, same_set) { list_del_init(&rdev->same_set); kobject_del(&rdev->kobj); export_rdev(rdev, mddev); } } EXPORT_SYMBOL_GPL(mddev_unlock); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) { struct md_rdev *rdev; rdev_for_each_rcu(rdev, mddev) if (rdev->desc_nr == nr) return rdev; return NULL; } EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; return NULL; } struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; rdev_for_each_rcu(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; return NULL; } EXPORT_SYMBOL_GPL(md_find_rdev_rcu); static struct md_personality *find_pers(int level, char *clevel) { struct md_personality *pers; list_for_each_entry(pers, &pers_list, list) { if (level != LEVEL_NONE && pers->level == level) return pers; if (strcmp(pers->name, clevel)==0) return pers; } return NULL; } /* return the offset of the super block in 512byte sectors */ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) { return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); } static int alloc_disk_sb(struct md_rdev *rdev) { rdev->sb_page = alloc_page(GFP_KERNEL); if (!rdev->sb_page) return -ENOMEM; return 0; } void md_rdev_clear(struct md_rdev *rdev) { if (rdev->sb_page) { put_page(rdev->sb_page); rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_start = 0; rdev->sectors = 0; } if (rdev->bb_page) { put_page(rdev->bb_page); rdev->bb_page = NULL; } badblocks_exit(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); static void super_written(struct bio *bio) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; if (bio->bi_status) { pr_err("md: %s gets error=%d\n", __func__, blk_status_to_errno(bio->bi_status)); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); set_bit(LastDev, &rdev->flags); } } else clear_bit(LastDev, &rdev->flags); bio_put(bio); rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); } void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page) { /* write first size bytes of page to sector of rdev * Increment mddev->pending_writes before returning * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error */ struct bio *bio; if (!page) return; if (test_bit(Faulty, &rdev->flags)) return; bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, 1, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA, GFP_NOIO, &mddev->sync_set); atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; __bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && test_bit(FailFast, &rdev->flags) && !test_bit(LastDev, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; atomic_inc(&mddev->pending_writes); submit_bio(bio); } int md_super_wait(struct mddev *mddev) { /* wait for all superblock writes that were scheduled to complete */ wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) return -EAGAIN; return 0; } int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op) { struct bio bio; struct bio_vec bvec; if (metadata_op && rdev->meta_bdev) bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); else bio_init(&bio, rdev->bdev, &bvec, 1, opf); if (metadata_op) bio.bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && (rdev->mddev->reshape_backwards == (sector >= rdev->mddev->reshape_position))) bio.bi_iter.bi_sector = sector + rdev->new_data_offset; else bio.bi_iter.bi_sector = sector + rdev->data_offset; __bio_add_page(&bio, page, size, 0); submit_bio_wait(&bio); return !bio.bi_status; } EXPORT_SYMBOL_GPL(sync_page_io); static int read_disk_sb(struct md_rdev *rdev, int size) { if (rdev->sb_loaded) return 0; if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) goto fail; rdev->sb_loaded = 1; return 0; fail: pr_err("md: disabled device %pg, could not read superblock.\n", rdev->bdev); return -EINVAL; } static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) { return sb1->set_uuid0 == sb2->set_uuid0 && sb1->set_uuid1 == sb2->set_uuid1 && sb1->set_uuid2 == sb2->set_uuid2 && sb1->set_uuid3 == sb2->set_uuid3; } static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) { int ret; mdp_super_t *tmp1, *tmp2; tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); if (!tmp1 || !tmp2) { ret = 0; goto abort; } *tmp1 = *sb1; *tmp2 = *sb2; /* * nr_disks is not constant */ tmp1->nr_disks = 0; tmp2->nr_disks = 0; ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); abort: kfree(tmp1); kfree(tmp2); return ret; } static u32 md_csum_fold(u32 csum) { csum = (csum & 0xffff) + (csum >> 16); return (csum & 0xffff) + (csum >> 16); } static unsigned int calc_sb_csum(mdp_super_t *sb) { u64 newcsum = 0; u32 *sb32 = (u32*)sb; int i; unsigned int disk_csum, csum; disk_csum = sb->sb_csum; sb->sb_csum = 0; for (i = 0; i < MD_SB_BYTES/4 ; i++) newcsum += sb32[i]; csum = (newcsum & 0xffffffff) + (newcsum>>32); #ifdef CONFIG_ALPHA /* This used to use csum_partial, which was wrong for several * reasons including that different results are returned on * different architectures. It isn't critical that we get exactly * the same return value as before (we always csum_fold before * testing, and that removes any differences). However as we * know that csum_partial always returned a 16bit value on * alphas, do a fold to maximise conformity to previous behaviour. */ sb->sb_csum = md_csum_fold(disk_csum); #else sb->sb_csum = disk_csum; #endif return csum; } /* * Handle superblock details. * We want to be able to handle multiple superblock formats * so we have a common interface to them all, and an array of * different handlers. * We rely on user-space to write the initial superblock, and support * reading and updating of superblocks. * Interface methods are: * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) * loads and validates a superblock on dev. * if refdev != NULL, compare superblocks on both devices * Return: * 0 - dev has a superblock that is compatible with refdev * 1 - dev has a superblock that is compatible and newer than refdev * so dev should be used as the refdev in future * -EINVAL superblock incompatible or invalid * -othererror e.g. -EIO * * int validate_super(struct mddev *mddev, struct md_rdev *dev) * Verify that dev is acceptable into mddev. * The first time, mddev->raid_disks will be 0, and data from * dev should be merged in. Subsequent calls check that dev * is new enough. Return 0 or -EINVAL * * void sync_super(struct mddev *mddev, struct md_rdev *dev) * Update the superblock for rdev with data in mddev * This does not write to disc. * */ struct super_type { char *name; struct module *owner; int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version); int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev); void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); unsigned long long (*rdev_size_change)(struct md_rdev *rdev, sector_t num_sectors); int (*allow_new_offset)(struct md_rdev *rdev, unsigned long long new_offset); }; /* * Check that the given mddev has no bitmap. * * This function is called from the run method of all personalities that do not * support bitmaps. It prints an error message and returns non-zero if mddev * has a bitmap. Otherwise, it returns 0. * */ int md_check_no_bitmap(struct mddev *mddev) { if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; pr_warn("%s: bitmaps are not supported for %s\n", mdname(mddev), mddev->pers->name); return 1; } EXPORT_SYMBOL(md_check_no_bitmap); /* * load_super for 0.90.0 */ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { mdp_super_t *sb; int ret; bool spare_disk = true; /* * Calculate the position of the superblock (512byte sectors), * it's at the end of the disk. * * It also happens to be a multiple of 4Kb. */ rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; ret = -EINVAL; sb = page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { pr_warn("md: invalid raid superblock magic on %pg\n", rdev->bdev); goto abort; } if (sb->major_version != 0 || sb->minor_version < 90 || sb->minor_version > 91) { pr_warn("Bad version number %d.%d on %pg\n", sb->major_version, sb->minor_version, rdev->bdev); goto abort; } if (sb->raid_disks <= 0) goto abort; if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); goto abort; } rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; rdev->new_data_offset = 0; rdev->sb_size = MD_SB_BYTES; rdev->badblocks.shift = -1; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; else rdev->desc_nr = sb->this_disk.number; /* not spare disk, or LEVEL_MULTIPATH */ if (sb->level == LEVEL_MULTIPATH || (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))) spare_disk = false; if (!refdev) { if (!spare_disk) ret = 1; else ret = 0; } else { __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); if (!md_uuid_equal(refsb, sb)) { pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, refdev->bdev); goto abort; } if (!md_sb_equal(refsb, sb)) { pr_warn("md: %pg has same UUID but different superblock to %pg\n", rdev->bdev, refdev->bdev); goto abort; } ev1 = md_event(sb); ev2 = md_event(refsb); if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; } rdev->sectors = rdev->sb_start; /* Limit to 4TB as metadata cannot record more than that. * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ ret = -EINVAL; abort: return ret; } /* * validate_super for 0.90.0 */ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) { mdp_disk_t *desc; mdp_super_t *sb = page_address(rdev->sb_page); __u64 ev1 = md_event(sb); rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; mddev->external = 0; mddev->chunk_sectors = sb->chunk_size >> 9; mddev->ctime = sb->ctime; mddev->utime = sb->utime; mddev->level = sb->level; mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; mddev->dev_sectors = ((sector_t)sb->size) * 2; mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; /* bitmap can use 60 K after the 4K superblocks */ mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); mddev->reshape_backwards = 0; if (mddev->minor_version >= 91) { mddev->reshape_position = sb->reshape_position; mddev->delta_disks = sb->delta_disks; mddev->new_level = sb->new_level; mddev->new_layout = sb->new_layout; mddev->new_chunk_sectors = sb->new_chunk >> 9; if (mddev->delta_disks < 0) mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } if (mddev->level == 0) mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; else { if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { mddev->recovery_cp = sb->recovery_cp; } else mddev->recovery_cp = 0; } memcpy(mddev->uuid+0, &sb->set_uuid0, 4); memcpy(mddev->uuid+4, &sb->set_uuid1, 4); memcpy(mddev->uuid+8, &sb->set_uuid2, 4); memcpy(mddev->uuid+12,&sb->set_uuid3, 4); mddev->max_disks = MD_SB_DISKS; if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && mddev->bitmap_info.file == NULL) { mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; } } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling, except * for spares (which don't need an event count) */ ++ev1; if (sb->disks[rdev->desc_nr].state & ( (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. */ if (ev1 < mddev->bitmap->events_cleared) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0; } if (mddev->level != LEVEL_MULTIPATH) { desc = sb->disks + rdev->desc_nr; if (desc->state & (1<<MD_DISK_FAULTY)) set_bit(Faulty, &rdev->flags); else if (desc->state & (1<<MD_DISK_SYNC) /* && desc->raid_disk < mddev->raid_disks */) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; rdev->saved_raid_disk = desc->raid_disk; } else if (desc->state & (1<<MD_DISK_ACTIVE)) { /* active but not in sync implies recovery up to * reshape position. We don't know exactly where * that is, so set to zero for now */ if (mddev->minor_version >= 91) { rdev->recovery_offset = 0; rdev->raid_disk = desc->raid_disk; } } if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); if (desc->state & (1<<MD_DISK_FAILFAST)) set_bit(FailFast, &rdev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); return 0; } /* * sync_super for 0.90.0 */ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) { mdp_super_t *sb; struct md_rdev *rdev2; int next_spare = mddev->raid_disks; /* make rdev->sb match mddev data.. * * 1/ zero out disks * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); * 3/ any empty disks < next_spare become removed * * disks[0] gets initialised to REMOVED because * we cannot be sure from other fields if it has * been initialised or not. */ int i; int active=0, working=0,failed=0,spare=0,nr_disks=0; rdev->sb_size = MD_SB_BYTES; sb = page_address(rdev->sb_page); memset(sb, 0, sizeof(*sb)); sb->md_magic = MD_SB_MAGIC; sb->major_version = mddev->major_version; sb->patch_version = mddev->patch_version; sb->gvalid_words = 0; /* ignored */ memcpy(&sb->set_uuid0, mddev->uuid+0, 4); memcpy(&sb->set_uuid1, mddev->uuid+4, 4); memcpy(&sb->set_uuid2, mddev->uuid+8, 4); memcpy(&sb->set_uuid3, mddev->uuid+12,4); sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); sb->level = mddev->level; sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); sb->state = 0; sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; if (mddev->reshape_position == MaxSector) sb->minor_version = 90; else { sb->minor_version = 91; sb->reshape_position = mddev->reshape_position; sb->new_level = mddev->new_level; sb->delta_disks = mddev->delta_disks; sb->new_layout = mddev->new_layout; sb->new_chunk = mddev->new_chunk_sectors << 9; } mddev->minor_version = sb->minor_version; if (mddev->in_sync) { sb->recovery_cp = mddev->recovery_cp; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; if (mddev->recovery_cp == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else sb->recovery_cp = 0; sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_sectors << 9; if (mddev->bitmap && mddev->bitmap_info.file == NULL) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); rdev_for_each(rdev2, mddev) { mdp_disk_t *d; int desc_nr; int is_active = test_bit(In_sync, &rdev2->flags); if (rdev2->raid_disk >= 0 && sb->minor_version >= 91) /* we have nowhere to store the recovery_offset, * but if it is not below the reshape_position, * we can piggy-back on that. */ is_active = 1; if (rdev2->raid_disk < 0 || test_bit(Faulty, &rdev2->flags)) is_active = 0; if (is_active) desc_nr = rdev2->raid_disk; else desc_nr = next_spare++; rdev2->desc_nr = desc_nr; d = &sb->disks[rdev2->desc_nr]; nr_disks++; d->number = rdev2->desc_nr; d->major = MAJOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev); if (is_active) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ if (test_bit(Faulty, &rdev2->flags)) d->state = (1<<MD_DISK_FAULTY); else if (is_active) { d->state = (1<<MD_DISK_ACTIVE); if (test_bit(In_sync, &rdev2->flags)) d->state |= (1<<MD_DISK_SYNC); active++; working++; } else { d->state = 0; spare++; working++; } if (test_bit(WriteMostly, &rdev2->flags)) d->state |= (1<<MD_DISK_WRITEMOSTLY); if (test_bit(FailFast, &rdev2->flags)) d->state |= (1<<MD_DISK_FAILFAST); } /* now set the "removed" and "faulty" bits on any missing devices */ for (i=0 ; i < mddev->raid_disks ; i++) { mdp_disk_t *d = &sb->disks[i]; if (d->state == 0 && d->number == 0) { d->number = i; d->raid_disk = i; d->state = (1<<MD_DISK_REMOVED); d->state |= (1<<MD_DISK_FAULTY); failed++; } } sb->nr_disks = nr_disks; sb->active_disks = active; sb->working_disks = working; sb->failed_disks = failed; sb->spare_disks = spare; sb->this_disk = sb->disks[rdev->desc_nr]; sb->sb_csum = calc_sb_csum(sb); } /* * rdev_size_change for 0.90.0 */ static unsigned long long super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. */ if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } static int super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { /* non-zero offset changes not possible with v0.90 */ return new_offset == 0; } /* * version 1 superblock */ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) { __le32 disk_csum; u32 csum; unsigned long long newcsum; int size = 256 + le32_to_cpu(sb->max_dev)*2; __le32 *isuper = (__le32*)sb; disk_csum = sb->sb_csum; sb->sb_csum = 0; newcsum = 0; for (; size >= 4; size -= 4) newcsum += le32_to_cpu(*isuper++); if (size == 2) newcsum += le16_to_cpu(*(__le16*) isuper); csum = (newcsum & 0xffffffff) + (newcsum >> 32); sb->sb_csum = disk_csum; return cpu_to_le32(csum); } static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; int ret; sector_t sb_start; sector_t sectors; int bmask; bool spare_disk = true; /* * Calculate the position of the superblock in 512byte sectors. * It is always aligned to a 4K boundary and * depeding on minor_version, it can be: * 0: At least 8K, but less than 12K, from end of device * 1: At start of device * 2: 4K from start of device. */ switch(minor_version) { case 0: sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; sb_start &= ~(sector_t)(4*2-1); break; case 1: sb_start = 0; break; case 2: sb_start = 8; break; default: return -EINVAL; } rdev->sb_start = sb_start; /* superblock is rarely larger than 1K, but it can be larger, * and it is safe to read 4k, so we do that */ ret = read_disk_sb(rdev, 4096); if (ret) return ret; sb = page_address(rdev->sb_page); if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || sb->major_version != cpu_to_le32(1) || le32_to_cpu(sb->max_dev) > (4096-256)/2 || le64_to_cpu(sb->super_offset) != rdev->sb_start || (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); return -EINVAL; } if (le64_to_cpu(sb->data_size) < 10) { pr_warn("md: data_size too small on %pg\n", rdev->bdev); return -EINVAL; } if (sb->pad0 || sb->pad3[0] || memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) /* Some padding is non-zero, might be a new feature */ return -EINVAL; rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); rdev->new_data_offset = rdev->data_offset; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; if (minor_version && rdev->data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; if (minor_version && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) rdev->desc_nr = -1; else rdev->desc_nr = le32_to_cpu(sb->dev_number); if (!rdev->bb_page) { rdev->bb_page = alloc_page(GFP_KERNEL); if (!rdev->bb_page) return -ENOMEM; } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && rdev->badblocks.count == 0) { /* need to load the bad block list. * Currently we limit it to one page. */ s32 offset; sector_t bb_sector; __le64 *bbp; int i; int sectors = le16_to_cpu(sb->bblog_size); if (sectors > (PAGE_SIZE / 512)) return -EINVAL; offset = le32_to_cpu(sb->bblog_offset); if (offset == 0) return -EINVAL; bb_sector = (long long)offset; if (!sync_page_io(rdev, bb_sector, sectors << 9, rdev->bb_page, REQ_OP_READ, true)) return -EIO; bbp = (__le64 *)page_address(rdev->bb_page); rdev->badblocks.shift = sb->bblog_shift; for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { u64 bb = le64_to_cpu(*bbp); int count = bb & (0x3ff); u64 sector = bb >> 10; sector <<= sb->bblog_shift; count <<= sb->bblog_shift; if (bb + 1 == 0) break; if (badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; if ((le32_to_cpu(sb->feature_map) & (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); rdev->ppl.size = le16_to_cpu(sb->ppl.size); rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && sb->level != 0) return -EINVAL; /* not spare disk, or LEVEL_MULTIPATH */ if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) spare_disk = false; if (!refdev) { if (!spare_disk) ret = 1; else ret = 0; } else { __u64 ev1, ev2; struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || sb->level != refsb->level || sb->layout != refsb->layout || sb->chunksize != refsb->chunksize) { pr_warn("md: %pg has strangely different superblock to %pg\n", rdev->bdev, refdev->bdev); return -EINVAL; } ev1 = le64_to_cpu(sb->events); ev2 = le64_to_cpu(refsb->events); if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; } if (minor_version) sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; else sectors = rdev->sb_start; if (sectors < le64_to_cpu(sb->data_size)) return -EINVAL; rdev->sectors = le64_to_cpu(sb->data_size); return ret; } static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; mddev->external = 0; mddev->chunk_sectors = le32_to_cpu(sb->chunksize); mddev->ctime = le64_to_cpu(sb->ctime); mddev->utime = le64_to_cpu(sb->utime); mddev->level = le32_to_cpu(sb->level); mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; /* Default location for bitmap is 1K after superblock * using 3K - total of 4K */ mddev->bitmap_info.default_offset = 1024 >> 9; mddev->bitmap_info.default_space = (4096-1024) >> 9; mddev->reshape_backwards = 0; mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && mddev->bitmap_info.file == NULL) { mddev->bitmap_info.offset = (__s32)le32_to_cpu(sb->bitmap_offset); /* Metadata doesn't record how much space is available. * For 1.0, we assume we can use up to the superblock * if before, else to 4K beyond superblock. * For others, assume no change is possible. */ if (mddev->minor_version > 0) mddev->bitmap_info.space = 0; else if (mddev->bitmap_info.offset > 0) mddev->bitmap_info.space = 8 - mddev->bitmap_info.offset; else mddev->bitmap_info.space = -mddev->bitmap_info.offset; } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { mddev->reshape_position = le64_to_cpu(sb->reshape_position); mddev->delta_disks = le32_to_cpu(sb->delta_disks); mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); if (mddev->delta_disks < 0 || (mddev->delta_disks == 0 && (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_BACKWARDS))) mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } if (mddev->level == 0 && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) mddev->layout = -1; if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) return -EINVAL; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && (le32_to_cpu(sb->feature_map) & MD_FEATURE_MULTIPLE_PPLS)) return -EINVAL; set_bit(MD_HAS_PPL, &mddev->flags); } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count) */ ++ev1; if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old. */ if (ev1 < mddev->bitmap->events_cleared) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0; } if (mddev->level != LEVEL_MULTIPATH) { int role; if (rdev->desc_nr < 0 || rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { role = MD_DISK_ROLE_SPARE; rdev->desc_nr = -1; } else role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case MD_DISK_ROLE_SPARE: /* spare */ break; case MD_DISK_ROLE_FAULTY: /* faulty */ set_bit(Faulty, &rdev->flags); break; case MD_DISK_ROLE_JOURNAL: /* journal device */ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { /* journal device without journal feature */ pr_warn("md: journal device provided without journal feature, ignoring the device\n"); return -EINVAL; } set_bit(Journal, &rdev->flags); rdev->journal_tail = le64_to_cpu(sb->journal_tail); rdev->raid_disk = 0; break; default: rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) { rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_BITMAP)) rdev->saved_raid_disk = -1; } else { /* * If the array is FROZEN, then the device can't * be in_sync with rest of array. */ if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) set_bit(In_sync, &rdev->flags); } rdev->raid_disk = role; break; } if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); if (sb->devflags & FailFast1) set_bit(FailFast, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); return 0; } static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb; struct md_rdev *rdev2; int max_dev, i; /* make rdev->sb match mddev and rdev data. */ sb = page_address(rdev->sb_page); sb->feature_map = 0; sb->pad0 = 0; sb->recovery_offset = cpu_to_le64(0); memset(sb->pad3, 0, sizeof(sb->pad3)); sb->utime = cpu_to_le64((__u64)mddev->utime); sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) sb->resync_offset = cpu_to_le64(mddev->recovery_cp); else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) sb->resync_offset = cpu_to_le64(MaxSector); else sb->resync_offset = cpu_to_le64(0); sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->dev_sectors); sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); if (test_bit(FailFast, &rdev->flags)) sb->devflags |= FailFast1; else sb->devflags &= ~FailFast1; if (test_bit(WriteMostly, &rdev->flags)) sb->devflags |= WriteMostly1; else sb->devflags &= ~WriteMostly1; sb->data_offset = cpu_to_le64(rdev->data_offset); sb->data_size = cpu_to_le64(rdev->sectors); if (mddev->bitmap && mddev->bitmap_info.file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); if (rdev->saved_raid_disk >= 0 && mddev->bitmap) sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } /* Note: recovery_offset and journal_tail share space */ if (test_bit(Journal, &rdev->flags)) sb->journal_tail = cpu_to_le64(rdev->journal_tail); if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_REPLACEMENT); if (mddev->reshape_position != MaxSector) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); sb->reshape_position = cpu_to_le64(mddev->reshape_position); sb->new_layout = cpu_to_le32(mddev->new_layout); sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->new_level = cpu_to_le32(mddev->new_level); sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); if (mddev->delta_disks == 0 && mddev->reshape_backwards) sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); if (rdev->new_data_offset != rdev->data_offset) { sb->feature_map |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset - rdev->data_offset)); } } if (mddev_is_clustered(mddev)) sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) /* Cannot record bad blocks on this device */ md_error(mddev, rdev); else { struct badblocks *bb = &rdev->badblocks; __le64 *bbp = (__le64 *)page_address(rdev->bb_page); u64 *p = bb->page; sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); if (bb->changed) { unsigned seq; retry: seq = read_seqbegin(&bb->lock); memset(bbp, 0xff, PAGE_SIZE); for (i = 0 ; i < bb->count ; i++) { u64 internal_bb = p[i]; u64 store_bb = ((BB_OFFSET(internal_bb) << 10) | BB_LEN(internal_bb)); bbp[i] = cpu_to_le64(store_bb); } bb->changed = 0; if (read_seqretry(&bb->lock, seq)) goto retry; bb->sector = (rdev->sb_start + (int)le32_to_cpu(sb->bblog_offset)); bb->size = le16_to_cpu(sb->bblog_size); } } max_dev = 0; rdev_for_each(rdev2, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; if (max_dev > le32_to_cpu(sb->max_dev)) { int bmask; sb->max_dev = cpu_to_le32(max_dev); rdev->sb_size = max_dev * 2 + 256; bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; } else max_dev = le32_to_cpu(sb->max_dev); for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); if (test_bit(MD_HAS_PPL, &mddev->flags)) { if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); else sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); sb->ppl.size = cpu_to_le16(rdev->ppl.size); } rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else if (test_bit(Journal, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); } sb->sb_csum = calc_sb_1_csum(sb); } static sector_t super_1_choose_bm_space(sector_t dev_size) { sector_t bm_space; /* if the device is bigger than 8Gig, save 64k for bitmap * usage, if bigger than 200Gig, save 128k */ if (dev_size < 64*2) bm_space = 0; else if (dev_size - 64*2 >= 200*1024*1024*2) bm_space = 128*2; else if (dev_size - 4*2 > 8*1024*1024*2) bm_space = 64*2; else bm_space = 4*2; return bm_space; } static unsigned long long super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { struct mdp_superblock_1 *sb; sector_t max_sectors; if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->data_offset != rdev->new_data_offset) return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; } else if (rdev->mddev->bitmap_info.offset) { /* minor version 0 with bitmap we can't move */ return 0; } else { /* minor version 0; superblock after data */ sector_t sb_start, bm_space; sector_t dev_size = bdev_nr_sectors(rdev->bdev); /* 8K is for superblock */ sb_start = dev_size - 8*2; sb_start &= ~(sector_t)(4*2 - 1); bm_space = super_1_choose_bm_space(dev_size); /* Space that can be used to store date needs to decrease * superblock bitmap space and bad block space(4K) */ max_sectors = sb_start - bm_space - 4*2; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } static int super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { /* All necessary checks on new >= old have been done */ struct bitmap *bitmap; if (new_offset >= rdev->data_offset) return 1; /* with 1.0 metadata, there is no metadata to tread on * so we can always move back */ if (rdev->mddev->minor_version == 0) return 1; /* otherwise we must be sure not to step on * any metadata, so stay: * 36K beyond start of superblock * beyond end of badblocks * beyond write-intent bitmap */ if (rdev->sb_start + (32+4)*2 > new_offset) return 0; bitmap = rdev->mddev->bitmap; if (bitmap && !rdev->mddev->bitmap_info.file && rdev->sb_start + rdev->mddev->bitmap_info.offset + bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) return 0; if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) return 0; return 1; } static struct super_type super_types[] = { [0] = { .name = "0.90.0", .owner = THIS_MODULE, .load_super = super_90_load, .validate_super = super_90_validate, .sync_super = super_90_sync, .rdev_size_change = super_90_rdev_size_change, .allow_new_offset = super_90_allow_new_offset, }, [1] = { .name = "md-1", .owner = THIS_MODULE, .load_super = super_1_load, .validate_super = super_1_validate, .sync_super = super_1_sync, .rdev_size_change = super_1_rdev_size_change, .allow_new_offset = super_1_allow_new_offset, }, }; static void sync_super(struct mddev *mddev, struct md_rdev *rdev) { if (mddev->sync_super) { mddev->sync_super(mddev, rdev); return; } BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); super_types[mddev->major_version].sync_super(mddev, rdev); } static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) { struct md_rdev *rdev, *rdev2; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev1) { if (test_bit(Faulty, &rdev->flags) || test_bit(Journal, &rdev->flags) || rdev->raid_disk == -1) continue; rdev_for_each_rcu(rdev2, mddev2) { if (test_bit(Faulty, &rdev2->flags) || test_bit(Journal, &rdev2->flags) || rdev2->raid_disk == -1) continue; if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { rcu_read_unlock(); return 1; } } } rcu_read_unlock(); return 0; } static LIST_HEAD(pending_raid_disks); /* * Try to register data integrity profile for an mddev * * This is called when an array is started and after a disk has been kicked * from the array. It only succeeds if all working and active component devices * are integrity capable with matching profiles. */ int md_integrity_register(struct mddev *mddev) { struct md_rdev *rdev, *reference = NULL; if (list_empty(&mddev->disks)) return 0; /* nothing to do */ if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) return 0; /* shouldn't register, or already is */ rdev_for_each(rdev, mddev) { /* skip spares and non-functional disks */ if (test_bit(Faulty, &rdev->flags)) continue; if (rdev->raid_disk < 0) continue; if (!reference) { /* Use the first rdev as the reference */ reference = rdev; continue; } /* does this rdev's profile match the reference profile? */ if (blk_integrity_compare(reference->bdev->bd_disk, rdev->bdev->bd_disk) < 0) return -EINVAL; } if (!reference || !bdev_get_integrity(reference->bdev)) return 0; /* * All component devices are integrity capable and have matching * profiles, register the common profile for the md device. */ blk_integrity_register(mddev->gendisk, bdev_get_integrity(reference->bdev)); pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || (mddev->level != 1 && mddev->level != 10 && bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { /* * No need to handle the failure of bioset_integrity_create, * because the function is called by md_run() -> pers->run(), * md_run calls bioset_exit -> bioset_integrity_free in case * of failure case. */ pr_err("md: failed to create integrity pool for %s\n", mdname(mddev)); return -EINVAL; } return 0; } EXPORT_SYMBOL(md_integrity_register); /* * Attempt to add an rdev, but only if it is consistent with the current * integrity profile */ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { struct blk_integrity *bi_mddev; if (!mddev->gendisk) return 0; bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ return 0; if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { pr_err("%s: incompatible integrity profile for %pg\n", mdname(mddev), rdev->bdev); return -ENXIO; } return 0; } EXPORT_SYMBOL(md_integrity_add_rdev); static bool rdev_read_only(struct md_rdev *rdev) { return bdev_read_only(rdev->bdev) || (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); } static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; int err; /* prevent duplicates */ if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; if (rdev_read_only(rdev) && mddev->pers) return -EROFS; /* make sure rdev->sectors exceeds mddev->dev_sectors */ if (!test_bit(Journal, &rdev->flags) && rdev->sectors && (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care * about aligning sizes (e.g. linear) */ if (mddev->level > 0) return -ENOSPC; } else mddev->dev_sectors = rdev->sectors; } /* Verify rdev->desc_nr is unique. * If it is -1, assign a free number, else * check number is not in use */ rcu_read_lock(); if (rdev->desc_nr < 0) { int choice = 0; if (mddev->pers) choice = mddev->raid_disks; while (md_find_rdev_nr_rcu(mddev, choice)) choice++; rdev->desc_nr = choice; } else { if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { rcu_read_unlock(); return -EBUSY; } } rcu_read_unlock(); if (!test_bit(Journal, &rdev->flags) && mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { pr_warn("md: %s: array is limited to %d devices\n", mdname(mddev), mddev->max_disks); return -EBUSY; } snprintf(b, sizeof(b), "%pg", rdev->bdev); strreplace(b, '/', '!'); rdev->mddev = mddev; pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) mddev_create_serial_pool(mddev, rdev); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; /* failure here is OK */ err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); rdev->sysfs_unack_badblocks = sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); rdev->sysfs_badblocks = sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); /* May as well allow recovery to be retried once */ mddev->recovery_disabled++; return 0; fail: pr_warn("md: failed to register dev-%s for %s\n", b, mdname(mddev)); return err; } void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ static struct md_rdev claim_rdev; static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) { pr_debug("md: export_rdev(%pg)\n", rdev->bdev); md_rdev_clear(rdev); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif bdev_release(rdev->bdev_handle); rdev->bdev = NULL; kobject_put(&rdev->kobj); } static void md_kick_rdev_from_array(struct md_rdev *rdev) { struct mddev *mddev = rdev->mddev; bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); mddev_destroy_serial_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); sysfs_put(rdev->sysfs_unack_badblocks); sysfs_put(rdev->sysfs_badblocks); rdev->sysfs_state = NULL; rdev->sysfs_unack_badblocks = NULL; rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; synchronize_rcu(); /* * kobject_del() will wait for all in progress writers to be done, where * reconfig_mutex is held, hence it can't be called under * reconfig_mutex and it's delayed to mddev_unlock(). */ list_add(&rdev->same_set, &mddev->deleting); } static void export_array(struct mddev *mddev) { struct md_rdev *rdev; while (!list_empty(&mddev->disks)) { rdev = list_first_entry(&mddev->disks, struct md_rdev, same_set); md_kick_rdev_from_array(rdev); } mddev->raid_disks = 0; mddev->major_version = 0; } static bool set_in_sync(struct mddev *mddev) { lockdep_assert_held(&mddev->lock); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); spin_lock(&mddev->lock); if (!mddev->in_sync && percpu_ref_is_zero(&mddev->writes_pending)) { mddev->in_sync = 1; /* * Ensure ->in_sync is visible before we clear * ->sync_checkers. */ smp_mb(); set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_state); } if (--mddev->sync_checkers == 0) percpu_ref_switch_to_percpu(&mddev->writes_pending); } if (mddev->safemode == 1) mddev->safemode = 0; return mddev->in_sync; } static void sync_sbs(struct mddev *mddev, int nospares) { /* Update each superblock (in-memory image), but * if we are allowed to, skip spares which already * have the right event counter, or have one earlier * (which would mean they aren't being marked as dirty * with the rest of the array) */ struct md_rdev *rdev; rdev_for_each(rdev, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && rdev->sb_events+1 == mddev->events)) { /* Don't update this superblock */ rdev->sb_loaded = 2; } else { sync_super(mddev, rdev); rdev->sb_loaded = 1; } } } static bool does_sb_need_changing(struct mddev *mddev) { struct md_rdev *rdev = NULL, *iter; struct mdp_superblock_1 *sb; int role; /* Find a good rdev */ rdev_for_each(iter, mddev) if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { rdev = iter; break; } /* No good device found. */ if (!rdev) return false; sb = page_address(rdev->sb_page); /* Check if a device has become faulty or a spare become active */ rdev_for_each(rdev, mddev) { role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); /* Device activated? */ if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) return true; /* Device turned faulty? */ if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) return true; } /* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || (mddev->layout != le32_to_cpu(sb->layout)) || (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) return true; return false; } void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; int nospares = 0; int any_badblocks_changed = 0; int ret = -1; if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return; } repeat: if (mddev_is_clustered(mddev)) { if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) nospares = 1; ret = md_cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) md_cluster_ops->metadata_update_cancel(mddev); bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)); return; } } /* * First make sure individual recovery_offsets are correct * curr_resync_completed can only be used during recovery. * During reshape/resync it might use array-addresses rather * that device addresses. */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; } if (!mddev->persistent) { clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->external) { clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { rdev->badblocks.changed = 0; ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } clear_bit(Blocked, &rdev->flags); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } } wake_up(&mddev->sb_wait); return; } spin_lock(&mddev->lock); mddev->utime = ktime_get_real_seconds(); if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) /* just a clean<-> dirty transition, possibly leave spares alone, * though if events isn't the right even/odd, we will have to do * spares after all */ nospares = 1; if (force_change) nospares = 0; if (mddev->degraded) /* If the array is degraded, then skipping spares is both * dangerous and fairly pointless. * Dangerous because a device that was removed from the array * might have a event_count that still looks up-to-date, * so it can be re-added without a resync. * Pointless because if there are any spares to skip, * then a recovery will happen and soon that array won't * be degraded any more and the spare can go back to sleep then. */ nospares = 0; sync_req = mddev->in_sync; /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares && (mddev->in_sync && mddev->recovery_cp == MaxSector) && mddev->can_decrease_events && mddev->events != 1) { mddev->events--; mddev->can_decrease_events = 0; } else { /* otherwise we have to go forward and ... */ mddev->events ++; mddev->can_decrease_events = nospares; } /* * This 64-bit counter should never wrap. * Either we are in around ~1 trillion A.C., assuming * 1 reboot per second, or we have a bug... */ WARN_ON(mddev->events == 0); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) any_badblocks_changed++; if (test_bit(Faulty, &rdev->flags)) set_bit(FaultRecorded, &rdev->flags); } sync_sbs(mddev, nospares); spin_unlock(&mddev->lock); pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); if (mddev->queue) blk_add_trace_msg(mddev->queue, "md md_update_sb"); rewrite: md_bitmap_update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ if (!test_bit(Faulty, &rdev->flags)) { md_super_write(mddev,rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); pr_debug("md: (write) %pg's sb offset: %llu\n", rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { md_super_write(mddev, rdev, rdev->badblocks.sector, rdev->badblocks.size << 9, rdev->bb_page); rdev->badblocks.size = 0; } } else pr_debug("md: %pg (skipping faulty)\n", rdev->bdev); if (mddev->level == LEVEL_MULTIPATH) /* only need to write one superblock... */ break; } if (md_super_wait(mddev) < 0) goto rewrite; /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->metadata_update_finish(mddev); if (mddev->in_sync != sync_req || !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) /* have to write it out again */ goto repeat; wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify_dirent_safe(mddev->sysfs_completed); rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) clear_bit(Blocked, &rdev->flags); if (any_badblocks_changed) ack_all_badblocks(&rdev->badblocks); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } } EXPORT_SYMBOL(md_update_sb); static int add_bound_rdev(struct md_rdev *rdev) { struct mddev *mddev = rdev->mddev; int err = 0; bool add_journal = test_bit(Journal, &rdev->flags); if (!mddev->pers->hot_remove_disk || add_journal) { /* If there is hot_add_disk but no hot_remove_disk * then added disks for geometry changes, * and should be added immediately. */ super_types[mddev->major_version]. validate_super(mddev, rdev); err = mddev->pers->hot_add_disk(mddev, rdev); if (err) { md_kick_rdev_from_array(rdev); return err; } } sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_new_event(); md_wakeup_thread(mddev->thread); return 0; } /* words written to sysfs files may, or may not, be \n terminated. * We want to accept with case. For this we use cmd_match. */ static int cmd_match(const char *cmd, const char *str) { /* See if cmd, written into a sysfs file, matches * str. They must either be the same, or cmd can * have a trailing newline */ while (*cmd && *str && *cmd == *str) { cmd++; str++; } if (*cmd == '\n') cmd++; if (*str || *cmd) return 0; return 1; } struct rdev_sysfs_entry { struct attribute attr; ssize_t (*show)(struct md_rdev *, char *); ssize_t (*store)(struct md_rdev *, const char *, size_t); }; static ssize_t state_show(struct md_rdev *rdev, char *page) { char *sep = ","; size_t len = 0; unsigned long flags = READ_ONCE(rdev->flags); if (test_bit(Faulty, &flags) || (!test_bit(ExternalBbl, &flags) && rdev->badblocks.unacked_exist)) len += sprintf(page+len, "faulty%s", sep); if (test_bit(In_sync, &flags)) len += sprintf(page+len, "in_sync%s", sep); if (test_bit(Journal, &flags)) len += sprintf(page+len, "journal%s", sep); if (test_bit(WriteMostly, &flags)) len += sprintf(page+len, "write_mostly%s", sep); if (test_bit(Blocked, &flags) || (rdev->badblocks.unacked_exist && !test_bit(Faulty, &flags))) len += sprintf(page+len, "blocked%s", sep); if (!test_bit(Faulty, &flags) && !test_bit(Journal, &flags) && !test_bit(In_sync, &flags)) len += sprintf(page+len, "spare%s", sep); if (test_bit(WriteErrorSeen, &flags)) len += sprintf(page+len, "write_error%s", sep); if (test_bit(WantReplacement, &flags)) len += sprintf(page+len, "want_replacement%s", sep); if (test_bit(Replacement, &flags)) len += sprintf(page+len, "replacement%s", sep); if (test_bit(ExternalBbl, &flags)) len += sprintf(page+len, "external_bbl%s", sep); if (test_bit(FailFast, &flags)) len += sprintf(page+len, "failfast%s", sep); if (len) len -= strlen(sep); return len+sprintf(page+len, "\n"); } static ssize_t state_store(struct md_rdev *rdev, const char *buf, size_t len) { /* can write * faulty - simulates an error * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly * blocked - sets the Blocked flags * -blocked - clears the Blocked and possibly simulates an error * insync - sets Insync providing device isn't active * -insync - clear Insync for a device with a slot assigned, * so that it gets rebuilt based on bitmap * write_error - sets WriteErrorSeen * -write_error - clears WriteErrorSeen * {,-}failfast - set/clear FailFast */ struct mddev *mddev = rdev->mddev; int err = -EINVAL; bool need_update_sb = false; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); if (test_bit(MD_BROKEN, &rdev->mddev->flags)) err = -EBUSY; else err = 0; } else if (cmd_match(buf, "remove")) { if (rdev->mddev->pers) { clear_bit(Blocked, &rdev->flags); remove_and_add_spares(rdev->mddev, rdev); } if (rdev->raid_disk >= 0) err = -EBUSY; else { err = 0; if (mddev_is_clustered(mddev)) err = md_cluster_ops->remove_disk(mddev, rdev); if (err == 0) { md_kick_rdev_from_array(rdev); if (mddev->pers) { set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); } md_new_event(); } } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); mddev_create_serial_pool(rdev->mddev, rdev); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-writemostly")) { mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); need_update_sb = true; err = 0; } else if (cmd_match(buf, "blocked")) { set_bit(Blocked, &rdev->flags); err = 0; } else if (cmd_match(buf, "-blocked")) { if (!test_bit(Faulty, &rdev->flags) && !test_bit(ExternalBbl, &rdev->flags) && rdev->badblocks.unacked_exist) { /* metadata handler doesn't understand badblocks, * so we need to fail the device */ md_error(rdev->mddev, rdev); } clear_bit(Blocked, &rdev->flags); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); err = 0; } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; } else if (cmd_match(buf, "failfast")) { set_bit(FailFast, &rdev->flags); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-failfast")) { clear_bit(FailFast, &rdev->flags); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags)) { if (rdev->mddev->pers == NULL) { clear_bit(In_sync, &rdev->flags); rdev->saved_raid_disk = rdev->raid_disk; rdev->raid_disk = -1; err = 0; } } else if (cmd_match(buf, "write_error")) { set_bit(WriteErrorSeen, &rdev->flags); err = 0; } else if (cmd_match(buf, "-write_error")) { clear_bit(WriteErrorSeen, &rdev->flags); err = 0; } else if (cmd_match(buf, "want_replacement")) { /* Any non-spare device that is not a replacement can * become want_replacement at any time, but we then need to * check if recovery is needed. */ if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(Replacement, &rdev->flags)) set_bit(WantReplacement, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); err = 0; } else if (cmd_match(buf, "-want_replacement")) { /* Clearing 'want_replacement' is always allowed. * Once replacements starts it is too late though. */ err = 0; clear_bit(WantReplacement, &rdev->flags); } else if (cmd_match(buf, "replacement")) { /* Can only set a device as a replacement when array has not * yet been started. Once running, replacement is automatic * from spares, or by assigning 'slot'. */ if (rdev->mddev->pers) err = -EBUSY; else { set_bit(Replacement, &rdev->flags); err = 0; } } else if (cmd_match(buf, "-replacement")) { /* Similarly, can only clear Replacement before start */ if (rdev->mddev->pers) err = -EBUSY; else { clear_bit(Replacement, &rdev->flags); err = 0; } } else if (cmd_match(buf, "re-add")) { if (!rdev->mddev->pers) err = -EINVAL; else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && rdev->saved_raid_disk >= 0) { /* clear_bit is performed _after_ all the devices * have their local Faulty bit cleared. If any writes * happen in the meantime in the local node, they * will land in the local bitmap, which will be synced * by this node eventually */ if (!mddev_is_clustered(rdev->mddev) || (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { clear_bit(Faulty, &rdev->flags); err = add_bound_rdev(rdev); } } else err = -EBUSY; } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { set_bit(ExternalBbl, &rdev->flags); rdev->badblocks.shift = 0; err = 0; } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { clear_bit(ExternalBbl, &rdev->flags); err = 0; } if (need_update_sb) md_update_sb(mddev, 1); if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? err : len; } static struct rdev_sysfs_entry rdev_state = __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); static ssize_t errors_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); } static ssize_t errors_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned int n; int rv; rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; atomic_set(&rdev->corrected_errors, n); return len; } static struct rdev_sysfs_entry rdev_errors = __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t slot_show(struct md_rdev *rdev, char *page) { if (test_bit(Journal, &rdev->flags)) return sprintf(page, "journal\n"); else if (rdev->raid_disk < 0) return sprintf(page, "none\n"); else return sprintf(page, "%d\n", rdev->raid_disk); } static ssize_t slot_store(struct md_rdev *rdev, const char *buf, size_t len) { int slot; int err; if (test_bit(Journal, &rdev->flags)) return -EBUSY; if (strncmp(buf, "none", 4)==0) slot = -1; else { err = kstrtouint(buf, 10, (unsigned int *)&slot); if (err < 0) return err; if (slot < 0) /* overflow */ return -ENOSPC; } if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also * updating the 'rd%d' link, and communicating * with the personality with ->hot_*_disk. * For now we only support removing * failed/spare devices. This normally happens automatically, * but not when the metadata is externally managed. */ if (rdev->raid_disk == -1) return -EEXIST; /* personality does all needed checks */ if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; clear_bit(Blocked, &rdev->flags); remove_and_add_spares(rdev->mddev, rdev); if (rdev->raid_disk >= 0) return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here. */ int err; if (rdev->raid_disk != -1) return -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) return -EBUSY; if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; if (slot >= rdev->mddev->raid_disks && slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = slot; else rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); if (err) { rdev->raid_disk = -1; return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); /* failure here is OK */; sysfs_link_rdev(rdev->mddev, rdev); /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; /* assume it is working */ clear_bit(Faulty, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); set_bit(In_sync, &rdev->flags); sysfs_notify_dirent_safe(rdev->sysfs_state); } return len; } static struct rdev_sysfs_entry rdev_slot = __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); static ssize_t offset_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); } static ssize_t offset_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long offset; if (kstrtoull(buf, 10, &offset) < 0) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; if (rdev->sectors && rdev->mddev->external) /* Must set offset before size, so overlap checks * can be sane */ return -EBUSY; rdev->data_offset = offset; rdev->new_data_offset = offset; return len; } static struct rdev_sysfs_entry rdev_offset = __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); static ssize_t new_offset_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->new_data_offset); } static ssize_t new_offset_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long new_offset; struct mddev *mddev = rdev->mddev; if (kstrtoull(buf, 10, &new_offset) < 0) return -EINVAL; if (mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) return -EBUSY; if (new_offset == rdev->data_offset) /* reset is always permitted */ ; else if (new_offset > rdev->data_offset) { /* must not push array size beyond rdev_sectors */ if (new_offset - rdev->data_offset + mddev->dev_sectors > rdev->sectors) return -E2BIG; } /* Metadata worries about other space details. */ /* decreasing the offset is inconsistent with a backwards * reshape. */ if (new_offset < rdev->data_offset && mddev->reshape_backwards) return -EINVAL; /* Increasing offset is inconsistent with forwards * reshape. reshape_direction should be set to * 'backwards' first. */ if (new_offset > rdev->data_offset && !mddev->reshape_backwards) return -EINVAL; if (mddev->pers && mddev->persistent && !super_types[mddev->major_version] .allow_new_offset(rdev, new_offset)) return -E2BIG; rdev->new_data_offset = new_offset; if (new_offset > rdev->data_offset) mddev->reshape_backwards = 1; else if (new_offset < rdev->data_offset) mddev->reshape_backwards = 0; return len; } static struct rdev_sysfs_entry rdev_new_offset = __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); static ssize_t rdev_size_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) { /* check if two start/length pairs overlap */ if (a->data_offset + a->sectors <= b->data_offset) return false; if (b->data_offset + b->sectors <= a->data_offset) return false; return true; } static bool md_rdev_overlaps(struct md_rdev *rdev) { struct mddev *mddev; struct md_rdev *rdev2; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (test_bit(MD_DELETED, &mddev->flags)) continue; rdev_for_each(rdev2, mddev) { if (rdev != rdev2 && rdev->bdev == rdev2->bdev && md_rdevs_overlap(rdev, rdev2)) { spin_unlock(&all_mddevs_lock); return true; } } } spin_unlock(&all_mddevs_lock); return false; } static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) { unsigned long long blocks; sector_t new; if (kstrtoull(buf, 10, &blocks) < 0) return -EINVAL; if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) return -EINVAL; /* sector conversion overflow */ new = blocks * 2; if (new != blocks * 2) return -EINVAL; /* unsigned long long to sector_t overflow */ *sectors = new; return 0; } static ssize_t rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) { struct mddev *my_mddev = rdev->mddev; sector_t oldsectors = rdev->sectors; sector_t sectors; if (test_bit(Journal, &rdev->flags)) return -EBUSY; if (strict_blocks_to_sectors(buf, &sectors) < 0) return -EINVAL; if (rdev->data_offset != rdev->new_data_offset) return -EINVAL; /* too confusing */ if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { sectors = super_types[my_mddev->major_version]. rdev_size_change(rdev, sectors); if (!sectors) return -EBUSY; } else if (!sectors) sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!my_mddev->pers->resize) /* Cannot change size for RAID0 or Linear etc */ return -EINVAL; } if (sectors < my_mddev->dev_sectors) return -EINVAL; /* component must fit device */ rdev->sectors = sectors; /* * Check that all other rdevs with the same bdev do not overlap. This * check does not provide a hard guarantee, it just helps avoid * dangerous mistakes. */ if (sectors > oldsectors && my_mddev->external && md_rdev_overlaps(rdev)) { /* * Someone else could have slipped in a size change here, but * doing so is just silly. We put oldsectors back because we * know it is safe, and trust userspace not to race with itself. */ rdev->sectors = oldsectors; return -EBUSY; } return len; } static struct rdev_sysfs_entry rdev_size = __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) { unsigned long long recovery_start = rdev->recovery_offset; if (test_bit(In_sync, &rdev->flags) || recovery_start == MaxSector) return sprintf(page, "none\n"); return sprintf(page, "%llu\n", recovery_start); } static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long recovery_start; if (cmd_match(buf, "none")) recovery_start = MaxSector; else if (kstrtoull(buf, 10, &recovery_start)) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; rdev->recovery_offset = recovery_start; if (recovery_start == MaxSector) set_bit(In_sync, &rdev->flags); else clear_bit(In_sync, &rdev->flags); return len; } static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); /* sysfs access to bad-blocks list. * We present two files. * 'bad-blocks' lists sector numbers and lengths of ranges that * are recorded as bad. The list is truncated to fit within * the one-page limit of sysfs. * Writing "sector length" to this file adds an acknowledged * bad block list. * 'unacknowledged-bad-blocks' lists bad blocks that have not yet * been acknowledged. Writing to this file adds bad blocks * without acknowledging them. This is largely for testing. */ static ssize_t bb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 0); } static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) { int rv = badblocks_store(&rdev->badblocks, page, len, 0); /* Maybe that ack was all we needed */ if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) wake_up(&rdev->blocked_wait); return rv; } static struct rdev_sysfs_entry rdev_bad_blocks = __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); static ssize_t ubb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 1); } static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) { return badblocks_store(&rdev->badblocks, page, len, 1); } static struct rdev_sysfs_entry rdev_unack_bad_blocks = __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); static ssize_t ppl_sector_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); } static ssize_t ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long sector; if (kstrtoull(buf, 10, &sector) < 0) return -EINVAL; if (sector != (sector_t)sector) return -EINVAL; if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && rdev->raid_disk >= 0) return -EBUSY; if (rdev->mddev->persistent) { if (rdev->mddev->major_version == 0) return -EINVAL; if ((sector > rdev->sb_start && sector - rdev->sb_start > S16_MAX) || (sector < rdev->sb_start && rdev->sb_start - sector > -S16_MIN)) return -EINVAL; rdev->ppl.offset = sector - rdev->sb_start; } else if (!rdev->mddev->external) { return -EBUSY; } rdev->ppl.sector = sector; return len; } static struct rdev_sysfs_entry rdev_ppl_sector = __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); static ssize_t ppl_size_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%u\n", rdev->ppl.size); } static ssize_t ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned int size; if (kstrtouint(buf, 10, &size) < 0) return -EINVAL; if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && rdev->raid_disk >= 0) return -EBUSY; if (rdev->mddev->persistent) { if (rdev->mddev->major_version == 0) return -EINVAL; if (size > U16_MAX) return -EINVAL; } else if (!rdev->mddev->external) { return -EBUSY; } rdev->ppl.size = size; return len; } static struct rdev_sysfs_entry rdev_ppl_size = __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, &rdev_new_offset.attr, &rdev_size.attr, &rdev_recovery_start.attr, &rdev_bad_blocks.attr, &rdev_unack_bad_blocks.attr, &rdev_ppl_sector.attr, &rdev_ppl_size.attr, NULL, }; ATTRIBUTE_GROUPS(rdev_default); static ssize_t rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); if (!entry->show) return -EIO; if (!rdev->mddev) return -ENODEV; return entry->show(rdev, page); } static ssize_t rdev_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); struct kernfs_node *kn = NULL; bool suspend = false; ssize_t rv; struct mddev *mddev = rdev->mddev; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!mddev) return -ENODEV; if (entry->store == state_store) { if (cmd_match(page, "remove")) kn = sysfs_break_active_protection(kobj, attr); if (cmd_match(page, "remove") || cmd_match(page, "re-add") || cmd_match(page, "writemostly") || cmd_match(page, "-writemostly")) suspend = true; } rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); if (!rv) { if (rdev->mddev == NULL) rv = -ENODEV; else rv = entry->store(rdev, page, length); suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); } if (kn) sysfs_unbreak_active_protection(kn); return rv; } static void rdev_free(struct kobject *ko) { struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); kfree(rdev); } static const struct sysfs_ops rdev_sysfs_ops = { .show = rdev_attr_show, .store = rdev_attr_store, }; static const struct kobj_type rdev_ktype = { .release = rdev_free, .sysfs_ops = &rdev_sysfs_ops, .default_groups = rdev_default_groups, }; int md_rdev_init(struct md_rdev *rdev) { rdev->desc_nr = -1; rdev->saved_raid_disk = -1; rdev->raid_disk = -1; rdev->flags = 0; rdev->data_offset = 0; rdev->new_data_offset = 0; rdev->sb_events = 0; rdev->last_read_error = 0; rdev->sb_loaded = 0; rdev->bb_page = NULL; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); INIT_LIST_HEAD(&rdev->same_set); init_waitqueue_head(&rdev->blocked_wait); /* Add space to store bad block list. * This reserves the space even on arrays where it cannot * be used - I wonder if that matters */ return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * * mark the device faulty if: * * - the device is nonexistent (zero size) * - the device has no valid superblock * * a faulty rdev _never_ has rdev->sb set. */ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { struct md_rdev *rdev; sector_t size; int err; rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) return ERR_PTR(-ENOMEM); err = md_rdev_init(rdev); if (err) goto out_free_rdev; err = alloc_disk_sb(rdev); if (err) goto out_clear_rdev; rdev->bdev_handle = bdev_open_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, super_format == -2 ? &claim_rdev : rdev, NULL); if (IS_ERR(rdev->bdev_handle)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", MAJOR(newdev), MINOR(newdev)); err = PTR_ERR(rdev->bdev_handle); goto out_clear_rdev; } rdev->bdev = rdev->bdev_handle->bdev; kobject_init(&rdev->kobj, &rdev_ktype); size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; if (!size) { pr_warn("md: %pg has zero or unknown size, marking faulty!\n", rdev->bdev); err = -EINVAL; goto out_blkdev_put; } if (super_format >= 0) { err = super_types[super_format]. load_super(rdev, NULL, super_minor); if (err == -EINVAL) { pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", rdev->bdev, super_format, super_minor); goto out_blkdev_put; } if (err < 0) { pr_warn("md: could not read %pg's sb, not importing!\n", rdev->bdev); goto out_blkdev_put; } } return rdev; out_blkdev_put: bdev_release(rdev->bdev_handle); out_clear_rdev: md_rdev_clear(rdev); out_free_rdev: kfree(rdev); return ERR_PTR(err); } /* * Check a full RAID array for plausibility */ static int analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; freshest = NULL; rdev_for_each_safe(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: freshest = rdev; break; case 0: break; default: pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", rdev->bdev); md_kick_rdev_from_array(rdev); } /* Cannot find a valid fresh disk */ if (!freshest) { pr_warn("md: cannot find a valid disk\n"); return -EINVAL; } super_types[mddev->major_version]. validate_super(mddev, freshest); i = 0; rdev_for_each_safe(rdev, tmp, mddev) { if (mddev->max_disks && (rdev->desc_nr >= mddev->max_disks || i > mddev->max_disks)) { pr_warn("md: %s: %pg: only %d devices permitted\n", mdname(mddev), rdev->bdev, mddev->max_disks); md_kick_rdev_from_array(rdev); continue; } if (rdev != freshest) { if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { pr_warn("md: kicking non-fresh %pg from array!\n", rdev->bdev); md_kick_rdev_from_array(rdev); continue; } } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; set_bit(In_sync, &rdev->flags); } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && !test_bit(Journal, &rdev->flags)) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } } return 0; } /* Read a fixed-point number. * Numbers in sysfs attributes should be in "standard" units where * possible, so time should be in seconds. * However we internally use a a much smaller unit such as * milliseconds or jiffies. * This function takes a decimal number with a possible fractional * component, and produces an integer which is the result of * multiplying that number by 10^'scale'. * all without any floating-point arithmetic. */ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) { unsigned long result = 0; long decimals = -1; while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { if (*cp == '.') decimals = 0; else if (decimals < scale) { unsigned int value; value = *cp - '0'; result = result * 10 + value; if (decimals >= 0) decimals++; } cp++; } if (*cp == '\n') cp++; if (*cp) return -EINVAL; if (decimals < 0) decimals = 0; *res = result * int_pow(10, scale - decimals); return 0; } static ssize_t safe_delay_show(struct mddev *mddev, char *page) { unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); } static ssize_t safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { unsigned long msec; if (mddev_is_clustered(mddev)) { pr_warn("md: Safemode is disabled for clustered mode\n"); return -EINVAL; } if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) return -EINVAL; if (msec == 0) mddev->safemode_delay = 0; else { unsigned long old_delay = mddev->safemode_delay; unsigned long new_delay = (msec*HZ)/1000; if (new_delay == 0) new_delay = 1; mddev->safemode_delay = new_delay; if (new_delay < old_delay || old_delay == 0) mod_timer(&mddev->safemode_timer, jiffies+1); } return len; } static struct md_sysfs_entry md_safe_delay = __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t level_show(struct mddev *mddev, char *page) { struct md_personality *p; int ret; spin_lock(&mddev->lock); p = mddev->pers; if (p) ret = sprintf(page, "%s\n", p->name); else if (mddev->clevel[0]) ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) ret = sprintf(page, "%d\n", mddev->level); else ret = 0; spin_unlock(&mddev->lock); return ret; } static ssize_t level_store(struct mddev *mddev, const char *buf, size_t len) { char clevel[16]; ssize_t rv; size_t slen = len; struct md_personality *pers, *oldpers; long level; void *priv, *oldpriv; struct md_rdev *rdev; if (slen == 0 || slen >= sizeof(clevel)) return -EINVAL; rv = mddev_suspend_and_lock(mddev); if (rv) return rv; if (mddev->pers == NULL) { memcpy(mddev->clevel, buf, slen); if (mddev->clevel[slen-1] == '\n') slen--; mddev->clevel[slen] = 0; mddev->level = LEVEL_NONE; rv = len; goto out_unlock; } rv = -EROFS; if (!md_is_rdwr(mddev)) goto out_unlock; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape * - old personality can be suspended * - new personality will access other array. */ rv = -EBUSY; if (mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) goto out_unlock; rv = -EINVAL; if (!mddev->pers->quiesce) { pr_warn("md: %s: %s does not support online personality change\n", mdname(mddev), mddev->pers->name); goto out_unlock; } /* Now find the new personality */ memcpy(clevel, buf, slen); if (clevel[slen-1] == '\n') slen--; clevel[slen] = 0; if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; if (request_module("md-%s", clevel) != 0) request_module("md-level-%s", clevel); spin_lock(&pers_lock); pers = find_pers(level, clevel); if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); pr_warn("md: personality %s not loaded\n", clevel); rv = -EINVAL; goto out_unlock; } spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! */ module_put(pers->owner); rv = len; goto out_unlock; } if (!pers->takeover) { module_put(pers->owner); pr_warn("md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); rv = -EINVAL; goto out_unlock; } rdev_for_each(rdev, mddev) rdev->new_raid_disk = rdev->raid_disk; /* ->takeover must set new_* and/or delta_disks * if it succeeds, and may set them when it fails. */ priv = pers->takeover(mddev); if (IS_ERR(priv)) { mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; mddev->reshape_backwards = 0; module_put(pers->owner); pr_warn("md: %s: %s would not accept array\n", mdname(mddev), clevel); rv = PTR_ERR(priv); goto out_unlock; } /* Looks like we have a winner */ mddev_detach(mddev); spin_lock(&mddev->lock); oldpers = mddev->pers; oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; mddev->reshape_backwards = 0; mddev->degraded = 0; spin_unlock(&mddev->lock); if (oldpers->sync_request == NULL && mddev->external) { /* We are converting from a no-redundancy array * to a redundancy array and metadata is managed * externally so we need to be sure that writes * won't block due to a need to transition * clean->dirty * until external management is started. */ mddev->in_sync = 0; mddev->safemode_delay = 0; mddev->safemode = 0; } oldpers->free(mddev, oldpriv); if (oldpers->sync_request == NULL && pers->sync_request != NULL) { /* need to add the md_redundancy_group */ if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } if (oldpers->sync_request != NULL && pers->sync_request == NULL) { /* need to remove the md_redundancy_group */ if (mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; } module_put(oldpers->owner); rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; sysfs_unlink_rdev(mddev, rdev); } rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk == rdev->raid_disk) continue; rdev->raid_disk = rdev->new_raid_disk; if (rdev->raid_disk < 0) clear_bit(In_sync, &rdev->flags); else { if (sysfs_link_rdev(mddev, rdev)) pr_warn("md: cannot register rd%d for %s after level change\n", rdev->raid_disk, mdname(mddev)); } } if (pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync */ mddev->in_sync = 1; del_timer_sync(&mddev->safemode_timer); } blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread) md_update_sb(mddev, 1); sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(); rv = len; out_unlock: mddev_unlock_and_resume(mddev); return rv; } static struct md_sysfs_entry md_level = __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); static ssize_t layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ if (mddev->reshape_position != MaxSector && mddev->layout != mddev->new_layout) return sprintf(page, "%d (%d)\n", mddev->new_layout, mddev->layout); return sprintf(page, "%d\n", mddev->layout); } static ssize_t layout_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int n; int err; err = kstrtouint(buf, 10, &n); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) { if (mddev->pers->check_reshape == NULL) err = -EBUSY; else if (!md_is_rdwr(mddev)) err = -EROFS; else { mddev->new_layout = n; err = mddev->pers->check_reshape(mddev); if (err) mddev->new_layout = mddev->layout; } } else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) mddev->layout = n; } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_layout = __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); static ssize_t raid_disks_show(struct mddev *mddev, char *page) { if (mddev->raid_disks == 0) return 0; if (mddev->reshape_position != MaxSector && mddev->delta_disks != 0) return sprintf(page, "%d (%d)\n", mddev->raid_disks, mddev->raid_disks - mddev->delta_disks); return sprintf(page, "%d\n", mddev->raid_disks); } static int update_raid_disks(struct mddev *mddev, int raid_disks); static ssize_t raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int n; int err; err = kstrtouint(buf, 10, &n); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) err = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; err = -EINVAL; rdev_for_each(rdev, mddev) { if (olddisks < n && rdev->data_offset < rdev->new_data_offset) goto out_unlock; if (olddisks > n && rdev->data_offset > rdev->new_data_offset) goto out_unlock; } err = 0; mddev->delta_disks = n - olddisks; mddev->raid_disks = n; mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; out_unlock: mddev_unlock(mddev); return err ? err : len; } static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t uuid_show(struct mddev *mddev, char *page) { return sprintf(page, "%pU\n", mddev->uuid); } static struct md_sysfs_entry md_uuid = __ATTR(uuid, S_IRUGO, uuid_show, NULL); static ssize_t chunk_size_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector && mddev->chunk_sectors != mddev->new_chunk_sectors) return sprintf(page, "%d (%d)\n", mddev->new_chunk_sectors << 9, mddev->chunk_sectors << 9); return sprintf(page, "%d\n", mddev->chunk_sectors << 9); } static ssize_t chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long n; int err; err = kstrtoul(buf, 10, &n); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) { if (mddev->pers->check_reshape == NULL) err = -EBUSY; else if (!md_is_rdwr(mddev)) err = -EROFS; else { mddev->new_chunk_sectors = n >> 9; err = mddev->pers->check_reshape(mddev); if (err) mddev->new_chunk_sectors = mddev->chunk_sectors; } } else { mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) mddev->chunk_sectors = n >> 9; } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(struct mddev *mddev, char *page) { if (mddev->recovery_cp == MaxSector) return sprintf(page, "none\n"); return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); } static ssize_t resync_start_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long n; int err; if (cmd_match(buf, "none")) n = MaxSector; else { err = kstrtoull(buf, 10, &n); if (err < 0) return err; if (n != (sector_t)n) return -EINVAL; } err = mddev_lock(mddev); if (err) return err; if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) err = -EBUSY; if (!err) { mddev->recovery_cp = n; if (mddev->pers) set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_resync_start = __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); /* * The array state can be: * * clear * No devices, no size, no level * Equivalent to STOP_ARRAY ioctl * inactive * May have some settings, but array is not active * all IO results in error * When written, doesn't tear down array, but just stops it * suspended (not supported yet) * All IO requests will block. The array can be reconfigured. * Writing this, if accepted, will block until array is quiescent * readonly * no resync can happen. no superblocks get written. * write requests fail * read-auto * like readonly, but behaves like 'clean' on a write request. * * clean - no pending writes, but otherwise active. * When written to inactive array, starts without resync * If a write request arrives then * if metadata is known, mark 'dirty' and switch to 'active'. * if not known, block and switch to write-pending * If written to an active array that has pending writes, then fails. * active * fully active: IO and resync can be happening. * When written to inactive array, starts with resync * * write-pending * clean, but writes are blocked waiting for 'active' to be written. * * active-idle * like active, but no writes have been seen for a while (100msec). * * broken * Array is failed. It's useful because mounted-arrays aren't stopped * when array is failed, so this state will at least alert the user that * something is wrong. */ enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, write_pending, active_idle, broken, bad_word}; static char *array_states[] = { "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", "write-pending", "active-idle", "broken", NULL }; static int match_word(const char *word, char **list) { int n; for (n=0; list[n]; n++) if (cmd_match(word, list[n])) break; return n; } static ssize_t array_state_show(struct mddev *mddev, char *page) { enum array_state st = inactive; if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { switch(mddev->ro) { case MD_RDONLY: st = readonly; break; case MD_AUTO_READ: st = read_auto; break; case MD_RDWR: spin_lock(&mddev->lock); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) st = write_pending; else if (mddev->in_sync) st = clean; else if (mddev->safemode) st = active_idle; else st = active; spin_unlock(&mddev->lock); } if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) st = broken; } else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && mddev->dev_sectors == 0) st = clear; else st = inactive; } return sprintf(page, "%s\n", array_states[st]); } static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); static int restart_array(struct mddev *mddev); static ssize_t array_state_store(struct mddev *mddev, const char *buf, size_t len) { int err = 0; enum array_state st = match_word(buf, array_states); /* No lock dependent actions */ switch (st) { case suspended: /* not supported yet */ case write_pending: /* cannot be set */ case active_idle: /* cannot be set */ case broken: /* cannot be set */ case bad_word: return -EINVAL; default: break; } if (mddev->pers && (st == active || st == clean) && mddev->ro != MD_RDONLY) { /* don't take reconfig_mutex when toggling between * clean and active */ spin_lock(&mddev->lock); if (st == active) { restart_array(mddev); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); } else /* st == clean */ { restart_array(mddev); if (!set_in_sync(mddev)) err = -EBUSY; } if (!err) sysfs_notify_dirent_safe(mddev->sysfs_state); spin_unlock(&mddev->lock); return err ?: len; } err = mddev_lock(mddev); if (err) return err; switch (st) { case inactive: /* stop an active array, return 0 otherwise */ if (mddev->pers) err = do_md_stop(mddev, 2, NULL); break; case clear: err = do_md_stop(mddev, 0, NULL); break; case readonly: if (mddev->pers) err = md_set_readonly(mddev, NULL); else { mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); err = do_md_run(mddev); } break; case read_auto: if (mddev->pers) { if (md_is_rdwr(mddev)) err = md_set_readonly(mddev, NULL); else if (mddev->ro == MD_RDONLY) err = restart_array(mddev); if (err == 0) { mddev->ro = MD_AUTO_READ; set_disk_ro(mddev->gendisk, 0); } } else { mddev->ro = MD_AUTO_READ; err = do_md_run(mddev); } break; case clean: if (mddev->pers) { err = restart_array(mddev); if (err) break; spin_lock(&mddev->lock); if (!set_in_sync(mddev)) err = -EBUSY; spin_unlock(&mddev->lock); } else err = -EINVAL; break; case active: if (mddev->pers) { err = restart_array(mddev); if (err) break; clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); wake_up(&mddev->sb_wait); err = 0; } else { mddev->ro = MD_RDWR; set_disk_ro(mddev->gendisk, 0); err = do_md_run(mddev); } break; default: err = -EINVAL; break; } if (!err) { if (mddev->hold_active == UNTIL_IOCTL) mddev->hold_active = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_array_state = __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t max_corrected_read_errors_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", atomic_read(&mddev->max_corr_read_errors)); } static ssize_t max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int n; int rv; rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; if (n > INT_MAX) return -EINVAL; atomic_set(&mddev->max_corr_read_errors, n); return len; } static struct md_sysfs_entry max_corr_read_errors = __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, max_corrected_read_errors_store); static ssize_t null_show(struct mddev *mddev, char *page) { return -EINVAL; } static ssize_t new_dev_store(struct mddev *mddev, const char *buf, size_t len) { /* buf must be %d:%d\n? giving major and minor numbers */ /* The new device is added to the array. * If the array has a persistent superblock, we read the * superblock to initialise info and check validity. * Otherwise, only checking done is that in bind_rdev_to_array, * which mainly checks size. */ char *e; int major = simple_strtoul(buf, &e, 10); int minor; dev_t dev; struct md_rdev *rdev; int err; if (!*buf || *e != ':' || !e[1] || e[1] == '\n') return -EINVAL; minor = simple_strtoul(e+1, &e, 10); if (*e && *e != '\n') return -EINVAL; dev = MKDEV(major, minor); if (major != MAJOR(dev) || minor != MINOR(dev)) return -EOVERFLOW; err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->persistent) { rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { struct md_rdev *rdev0 = list_entry(mddev->disks.next, struct md_rdev, same_set); err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) goto out; } } else if (mddev->external) rdev = md_import_device(dev, -2, -1); else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { mddev_unlock_and_resume(mddev); return PTR_ERR(rdev); } err = bind_rdev_to_array(rdev, mddev); out: if (err) export_rdev(rdev, mddev); mddev_unlock_and_resume(mddev); if (!err) md_new_event(); return err ? err : len; } static struct md_sysfs_entry md_new_device = __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); static ssize_t bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; int err; err = mddev_lock(mddev); if (err) return err; if (!mddev->bitmap) goto out; /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ while (*buf) { chunk = end_chunk = simple_strtoul(buf, &end, 0); if (buf == end) break; if (*end == '-') { /* range */ buf = end + 1; end_chunk = simple_strtoul(buf, &end, 0); if (buf == end) break; } if (*end && !isspace(*end)) break; md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); buf = skip_spaces(end); } md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ out: mddev_unlock(mddev); return len; } static struct md_sysfs_entry md_bitmap = __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t size_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->dev_sectors / 2); } static int update_size(struct mddev *mddev, sector_t num_sectors); static ssize_t size_store(struct mddev *mddev, const char *buf, size_t len) { /* If array is inactive, we can reduce the component size, but * not increase it (except from 0). * If array is active, we can try an on-line resize */ sector_t sectors; int err = strict_blocks_to_sectors(buf, &sectors); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) { err = update_size(mddev, sectors); if (err == 0) md_update_sb(mddev, 1); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) mddev->dev_sectors = sectors; else err = -ENOSPC; } mddev_unlock(mddev); return err ? err : len; } static struct md_sysfs_entry md_size = __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); /* Metadata version. * This is one of * 'none' for arrays with no metadata (good luck...) * 'external' for arrays with externally managed metadata, * or N.M for internally known formats */ static ssize_t metadata_show(struct mddev *mddev, char *page) { if (mddev->persistent) return sprintf(page, "%d.%d\n", mddev->major_version, mddev->minor_version); else if (mddev->external) return sprintf(page, "external:%s\n", mddev->metadata_type); else return sprintf(page, "none\n"); } static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; int err; /* Changing the details of 'external' metadata is * always permitted. Otherwise there must be * no devices attached to the array. */ err = mddev_lock(mddev); if (err) return err; err = -EBUSY; if (mddev->external && strncmp(buf, "external:", 9) == 0) ; else if (!list_empty(&mddev->disks)) goto out_unlock; err = 0; if (cmd_match(buf, "none")) { mddev->persistent = 0; mddev->external = 0; mddev->major_version = 0; mddev->minor_version = 90; goto out_unlock; } if (strncmp(buf, "external:", 9) == 0) { size_t namelen = len-9; if (namelen >= sizeof(mddev->metadata_type)) namelen = sizeof(mddev->metadata_type)-1; memcpy(mddev->metadata_type, buf+9, namelen); mddev->metadata_type[namelen] = 0; if (namelen && mddev->metadata_type[namelen-1] == '\n') mddev->metadata_type[--namelen] = 0; mddev->persistent = 0; mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; goto out_unlock; } major = simple_strtoul(buf, &e, 10); err = -EINVAL; if (e==buf || *e != '.') goto out_unlock; buf = e+1; minor = simple_strtoul(buf, &e, 10); if (e==buf || (*e && *e != '\n') ) goto out_unlock; err = -ENOENT; if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) goto out_unlock; mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; mddev->external = 0; err = 0; out_unlock: mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_metadata = __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t action_show(struct mddev *mddev, char *page) { char *type = "idle"; unsigned long recovery = mddev->recovery; if (test_bit(MD_RECOVERY_FROZEN, &recovery)) type = "frozen"; else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) type = "reshape"; else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) type = "resync"; else if (test_bit(MD_RECOVERY_CHECK, &recovery)) type = "check"; else type = "repair"; } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) type = "recover"; else if (mddev->reshape_position != MaxSector) type = "reshape"; } return sprintf(page, "%s\n", type); } static void stop_sync_thread(struct mddev *mddev) { if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return; if (mddev_lock(mddev)) return; /* * Check again in case MD_RECOVERY_RUNNING is cleared before lock is * held. */ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { mddev_unlock(mddev); return; } if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); set_bit(MD_RECOVERY_INTR, &mddev->recovery); /* * Thread might be blocked waiting for metadata update which will now * never happen */ md_wakeup_thread_directly(mddev->sync_thread); mddev_unlock(mddev); } static void idle_sync_thread(struct mddev *mddev) { int sync_seq = atomic_read(&mddev->sync_seq); mutex_lock(&mddev->sync_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev); wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); mutex_unlock(&mddev->sync_mutex); } static void frozen_sync_thread(struct mddev *mddev) { mutex_lock(&mddev->sync_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev); wait_event(resync_wait, mddev->sync_thread == NULL && !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); mutex_unlock(&mddev->sync_mutex); } static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; if (cmd_match(page, "idle")) idle_sync_thread(mddev); else if (cmd_match(page, "frozen")) frozen_sync_thread(mddev); else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; else if (cmd_match(page, "resync")) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); else if (cmd_match(page, "recover")) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); } else if (cmd_match(page, "reshape")) { int err; if (mddev->pers->start_reshape == NULL) return -EINVAL; err = mddev_lock(mddev); if (!err) { if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { err = -EBUSY; } else if (mddev->reshape_position == MaxSector || mddev->pers->check_reshape == NULL || mddev->pers->check_reshape(mddev)) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = mddev->pers->start_reshape(mddev); } else { /* * If reshape is still in progress, and * md_check_recovery() can continue to reshape, * don't restart reshape because data can be * corrupted for raid456. */ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } mddev_unlock(mddev); } if (err) return err; sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); else if (!cmd_match(page, "repair")) return -EINVAL; clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); return len; } static struct md_sysfs_entry md_scan_mode = __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static ssize_t last_sync_action_show(struct mddev *mddev, char *page) { return sprintf(page, "%s\n", mddev->last_sync_action); } static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long) atomic64_read(&mddev->resync_mismatches)); } static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); static ssize_t sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), mddev->sync_speed_min ? "local": "system"); } static ssize_t sync_min_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int min; int rv; if (strncmp(buf, "system", 6)==0) { min = 0; } else { rv = kstrtouint(buf, 10, &min); if (rv < 0) return rv; if (min == 0) return -EINVAL; } mddev->sync_speed_min = min; return len; } static struct md_sysfs_entry md_sync_min = __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); static ssize_t sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), mddev->sync_speed_max ? "local": "system"); } static ssize_t sync_max_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int max; int rv; if (strncmp(buf, "system", 6)==0) { max = 0; } else { rv = kstrtouint(buf, 10, &max); if (rv < 0) return rv; if (max == 0) return -EINVAL; } mddev->sync_speed_max = max; return len; } static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); } static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); static ssize_t sync_force_parallel_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->parallel_resync); } static ssize_t sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) { long n; if (kstrtol(buf, 10, &n)) return -EINVAL; if (n != 0 && n != 1) return -EINVAL; mddev->parallel_resync = n; if (mddev->sync_thread) wake_up(&resync_wait); return len; } /* force parallel resync, even with shared block devices */ static struct md_sysfs_entry md_sync_force_parallel = __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, sync_force_parallel_show, sync_force_parallel_store); static ssize_t sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; if (mddev->curr_resync == MD_RESYNC_NONE) return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; if (!dt) dt++; db = resync - mddev->resync_mark_cnt; return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ } static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(struct mddev *mddev, char *page) { unsigned long long max_sectors, resync; if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); if (mddev->curr_resync == MD_RESYNC_YIELDED || mddev->curr_resync == MD_RESYNC_DELAYED) return sprintf(page, "delayed\n"); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; resync = mddev->curr_resync_completed; return sprintf(page, "%llu / %llu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); static ssize_t min_sync_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_min); } static ssize_t min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; int err; if (kstrtoull(buf, 10, &min)) return -EINVAL; spin_lock(&mddev->lock); err = -EINVAL; if (min > mddev->resync_max) goto out_unlock; err = -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) goto out_unlock; /* Round down to multiple of 4K for safety */ mddev->resync_min = round_down(min, 8); err = 0; out_unlock: spin_unlock(&mddev->lock); return err ?: len; } static struct md_sysfs_entry md_min_sync = __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); static ssize_t max_sync_show(struct mddev *mddev, char *page) { if (mddev->resync_max == MaxSector) return sprintf(page, "max\n"); else return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_max); } static ssize_t max_sync_store(struct mddev *mddev, const char *buf, size_t len) { int err; spin_lock(&mddev->lock); if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; int chunk; err = -EINVAL; if (kstrtoull(buf, 10, &max)) goto out_unlock; if (max < mddev->resync_min) goto out_unlock; err = -EBUSY; if (max < mddev->resync_max && md_is_rdwr(mddev) && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) goto out_unlock; /* Must be a multiple of chunk_size */ chunk = mddev->chunk_sectors; if (chunk) { sector_t temp = max; err = -EINVAL; if (sector_div(temp, chunk)) goto out_unlock; } mddev->resync_max = max; } wake_up(&mddev->recovery_wait); err = 0; out_unlock: spin_unlock(&mddev->lock); return err ?: len; } static struct md_sysfs_entry md_max_sync = __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); static ssize_t suspend_lo_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)READ_ONCE(mddev->suspend_lo)); } static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long new; int err; err = kstrtoull(buf, 10, &new); if (err < 0) return err; if (new != (sector_t)new) return -EINVAL; err = mddev_suspend(mddev, true); if (err) return err; WRITE_ONCE(mddev->suspend_lo, new); mddev_resume(mddev); return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); static ssize_t suspend_hi_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)READ_ONCE(mddev->suspend_hi)); } static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long new; int err; err = kstrtoull(buf, 10, &new); if (err < 0) return err; if (new != (sector_t)new) return -EINVAL; err = mddev_suspend(mddev, true); if (err) return err; WRITE_ONCE(mddev->suspend_hi, new); mddev_resume(mddev); return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); static ssize_t reshape_position_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector) return sprintf(page, "%llu\n", (unsigned long long)mddev->reshape_position); strcpy(page, "none\n"); return 5; } static ssize_t reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { struct md_rdev *rdev; unsigned long long new; int err; err = kstrtoull(buf, 10, &new); if (err < 0) return err; if (new != (sector_t)new) return -EINVAL; err = mddev_lock(mddev); if (err) return err; err = -EBUSY; if (mddev->pers) goto unlock; mddev->reshape_position = new; mddev->delta_disks = 0; mddev->reshape_backwards = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; err = 0; unlock: mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_reshape_position = __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, reshape_position_store); static ssize_t reshape_direction_show(struct mddev *mddev, char *page) { return sprintf(page, "%s\n", mddev->reshape_backwards ? "backwards" : "forwards"); } static ssize_t reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) { int backwards = 0; int err; if (cmd_match(buf, "forwards")) backwards = 0; else if (cmd_match(buf, "backwards")) backwards = 1; else return -EINVAL; if (mddev->reshape_backwards == backwards) return len; err = mddev_lock(mddev); if (err) return err; /* check if we are allowed to change */ if (mddev->delta_disks) err = -EBUSY; else if (mddev->persistent && mddev->major_version == 0) err = -EINVAL; else mddev->reshape_backwards = backwards; mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_reshape_direction = __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, reshape_direction_store); static ssize_t array_size_show(struct mddev *mddev, char *page) { if (mddev->external_size) return sprintf(page, "%llu\n", (unsigned long long)mddev->array_sectors/2); else return sprintf(page, "default\n"); } static ssize_t array_size_store(struct mddev *mddev, const char *buf, size_t len) { sector_t sectors; int err; err = mddev_lock(mddev); if (err) return err; /* cluster raid doesn't support change array_sectors */ if (mddev_is_clustered(mddev)) { mddev_unlock(mddev); return -EINVAL; } if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) sectors = mddev->pers->size(mddev, 0, 0); else sectors = mddev->array_sectors; mddev->external_size = 0; } else { if (strict_blocks_to_sectors(buf, &sectors) < 0) err = -EINVAL; else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) err = -E2BIG; else mddev->external_size = 1; } if (!err) { mddev->array_sectors = sectors; if (mddev->pers) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_array_size = __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, array_size_store); static ssize_t consistency_policy_show(struct mddev *mddev, char *page) { int ret; if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { ret = sprintf(page, "journal\n"); } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { ret = sprintf(page, "ppl\n"); } else if (mddev->bitmap) { ret = sprintf(page, "bitmap\n"); } else if (mddev->pers) { if (mddev->pers->sync_request) ret = sprintf(page, "resync\n"); else ret = sprintf(page, "none\n"); } else { ret = sprintf(page, "unknown\n"); } return ret; } static ssize_t consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) { int err = 0; if (mddev->pers) { if (mddev->pers->change_consistency_policy) err = mddev->pers->change_consistency_policy(mddev, buf); else err = -EBUSY; } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { set_bit(MD_HAS_PPL, &mddev->flags); } else { err = -EINVAL; } return err ? err : len; } static struct md_sysfs_entry md_consistency_policy = __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, consistency_policy_store); static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->fail_last_dev); } /* * Setting fail_last_dev to true to allow last device to be forcibly removed * from RAID1/RAID10. */ static ssize_t fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) { int ret; bool value; ret = kstrtobool(buf, &value); if (ret) return ret; if (value != mddev->fail_last_dev) mddev->fail_last_dev = value; return len; } static struct md_sysfs_entry md_fail_last_dev = __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, fail_last_dev_store); static ssize_t serialize_policy_show(struct mddev *mddev, char *page) { if (mddev->pers == NULL || (mddev->pers->level != 1)) return sprintf(page, "n/a\n"); else return sprintf(page, "%d\n", mddev->serialize_policy); } /* * Setting serialize_policy to true to enforce write IO is not reordered * for raid1. */ static ssize_t serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) { int err; bool value; err = kstrtobool(buf, &value); if (err) return err; if (value == mddev->serialize_policy) return len; err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->pers == NULL || (mddev->pers->level != 1)) { pr_err("md: serialize_policy is only effective for raid1\n"); err = -EINVAL; goto unlock; } if (value) mddev_create_serial_pool(mddev, NULL); else mddev_destroy_serial_pool(mddev, NULL); mddev->serialize_policy = value; unlock: mddev_unlock_and_resume(mddev); return err ?: len; } static struct md_sysfs_entry md_serialize_policy = __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, serialize_policy_store); static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, &md_chunk_size.attr, &md_size.attr, &md_resync_start.attr, &md_metadata.attr, &md_new_device.attr, &md_safe_delay.attr, &md_array_state.attr, &md_reshape_position.attr, &md_reshape_direction.attr, &md_array_size.attr, &max_corr_read_errors.attr, &md_consistency_policy.attr, &md_fail_last_dev.attr, &md_serialize_policy.attr, NULL, }; static const struct attribute_group md_default_group = { .attrs = md_default_attrs, }; static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, &md_last_scan_mode.attr, &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, &md_min_sync.attr, &md_max_sync.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, &md_bitmap.attr, &md_degraded.attr, NULL, }; static const struct attribute_group md_redundancy_group = { .name = NULL, .attrs = md_redundancy_attrs, }; static const struct attribute_group *md_attr_groups[] = { &md_default_group, &md_bitmap_group, NULL, }; static ssize_t md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->show) return -EIO; spin_lock(&all_mddevs_lock); if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } spin_unlock(&all_mddevs_lock); rv = entry->show(mddev, page); mddev_put(mddev); return rv; } static ssize_t md_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; spin_lock(&all_mddevs_lock); if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); mddev_put(mddev); return rv; } static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); del_gendisk(mddev->gendisk); put_disk(mddev->gendisk); } static const struct sysfs_ops md_sysfs_ops = { .show = md_attr_show, .store = md_attr_store, }; static const struct kobj_type md_ktype = { .release = md_kobj_release, .sysfs_ops = &md_sysfs_ops, .default_groups = md_attr_groups, }; int mdp_major = 0; static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); kobject_put(&mddev->kobj); } struct mddev *md_alloc(dev_t dev, char *name) { /* * If dev is zero, name is the name of a device to allocate with * an arbitrary minor number. It will be "md_???" * If dev is non-zero it must be a device number with a MAJOR of * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then * the device is being created by opening a node in /dev. * If "name" is not NULL, the device is being created by * writing to /sys/module/md_mod/parameters/new_array. */ static DEFINE_MUTEX(disks_mutex); struct mddev *mddev; struct gendisk *disk; int partitioned; int shift; int unit; int error ; /* * Wait for any previous instance of this device to be completely * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); if (IS_ERR(mddev)) { error = PTR_ERR(mddev); goto out_unlock; } partitioned = (MAJOR(mddev->unit) != MD_MAJOR); shift = partitioned ? MdpMinorShift : 0; unit = MINOR(mddev->unit) >> shift; if (name && !dev) { /* Need to ensure that 'name' is not a duplicate. */ struct mddev *mddev2; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev2, &all_mddevs, all_mddevs) if (mddev2->gendisk && strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); error = -EEXIST; goto out_free_mddev; } spin_unlock(&all_mddevs_lock); } if (name && dev) /* * Creating /dev/mdNNN via "newarray", so adjust hold_active. */ mddev->hold_active = UNTIL_STOP; error = -ENOMEM; disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) goto out_free_mddev; disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; disk->minors = 1 << shift; if (name) strcpy(disk->disk_name, name); else if (partitioned) sprintf(disk->disk_name, "md_d%d", unit); else sprintf(disk->disk_name, "md%d", unit); disk->fops = &md_fops; disk->private_data = mddev; mddev->queue = disk->queue; blk_set_stacking_limits(&mddev->queue->limits); blk_queue_write_cache(mddev->queue, true, true); disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); if (error) goto out_put_disk; kobject_init(&mddev->kobj, &md_ktype); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); if (error) { /* * The disk is already live at this point. Clear the hold flag * and let mddev_put take care of the deletion, as it isn't any * different from a normal close on last release now. */ mddev->hold_active = 0; mutex_unlock(&disks_mutex); mddev_put(mddev); return ERR_PTR(error); } kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); mutex_unlock(&disks_mutex); return mddev; out_put_disk: put_disk(disk); out_free_mddev: mddev_free(mddev); out_unlock: mutex_unlock(&disks_mutex); return ERR_PTR(error); } static int md_alloc_and_put(dev_t dev, char *name) { struct mddev *mddev = md_alloc(dev, name); if (IS_ERR(mddev)) return PTR_ERR(mddev); mddev_put(mddev); return 0; } static void md_probe(dev_t dev) { if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) return; if (create_on_open) md_alloc_and_put(dev, NULL); } static int add_named_array(const char *val, const struct kernel_param *kp) { /* * val must be "md_*" or "mdNNN". * For "md_*" we allocate an array with a large free minor number, and * set the name to val. val must not already be an active name. * For "mdNNN" we allocate an array with the minor number NNN * which must not already be in use. */ int len = strlen(val); char buf[DISK_NAME_LEN]; unsigned long devnum; while (len && val[len-1] == '\n') len--; if (len >= DISK_NAME_LEN) return -E2BIG; strscpy(buf, val, len+1); if (strncmp(buf, "md_", 3) == 0) return md_alloc_and_put(0, buf); if (strncmp(buf, "md", 2) == 0 && isdigit(buf[2]) && kstrtoul(buf+2, 10, &devnum) == 0 && devnum <= MINORMASK) return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); return -EINVAL; } static void md_safemode_timeout(struct timer_list *t) { struct mddev *mddev = from_timer(mddev, t, safemode_timer); mddev->safemode = 1; if (mddev->external) sysfs_notify_dirent_safe(mddev->sysfs_state); md_wakeup_thread(mddev->thread); } static int start_dirty_degraded; int md_run(struct mddev *mddev) { int err; struct md_rdev *rdev; struct md_personality *pers; bool nowait = true; if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. */ return -EINVAL; if (mddev->pers) return -EBUSY; /* Cannot run until previous stop completes properly */ if (mddev->sysfs_active) return -EBUSY; /* * Analyze all RAID superblock(s) */ if (!mddev->raid_disks) { if (!mddev->persistent) return -EINVAL; err = analyze_sbs(mddev); if (err) return -EINVAL; } if (mddev->level != LEVEL_NONE) request_module("md-level-%d", mddev->level); else if (mddev->clevel[0]) request_module("md-%s", mddev->clevel); /* * Drop all container device buffers, from now on * the only valid external interface is through the md * device. */ mddev->has_superblocks = false; rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev); if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { mddev->ro = MD_RDONLY; if (mddev->gendisk) set_disk_ro(mddev->gendisk, 1); } if (rdev->sb_page) mddev->has_superblocks = true; /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, * Internal Bitmap issues have been handled elsewhere. */ if (rdev->meta_bdev) { /* Nothing to check */; } else if (rdev->data_offset < rdev->sb_start) { if (mddev->dev_sectors && rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { pr_warn("md: %s: data overlaps metadata\n", mdname(mddev)); return -EINVAL; } } else { if (rdev->sb_start + rdev->sb_size/512 > rdev->data_offset) { pr_warn("md: %s: metadata overlaps data\n", mdname(mddev)); return -EINVAL; } } sysfs_notify_dirent_safe(rdev->sysfs_state); nowait = nowait && bdev_nowait(rdev->bdev); } if (!bioset_initialized(&mddev->bio_set)) { err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) return err; } if (!bioset_initialized(&mddev->sync_set)) { err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) goto exit_bio_set; } if (!bioset_initialized(&mddev->io_clone_set)) { err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, offsetof(struct md_io_clone, bio_clone), 0); if (err) goto exit_sync_set; } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); if (mddev->level != LEVEL_NONE) pr_warn("md: personality for level %d is not loaded!\n", mddev->level); else pr_warn("md: personality for level %s is not loaded!\n", mddev->clevel); err = -EINVAL; goto abort; } spin_unlock(&pers_lock); if (mddev->level != pers->level) { mddev->level = pers->level; mddev->new_level = pers->level; } strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ module_put(pers->owner); err = -EINVAL; goto abort; } if (pers->sync_request) { /* Warn if this is a potentially silly * configuration. */ struct md_rdev *rdev2; int warned = 0; rdev_for_each(rdev, mddev) rdev_for_each(rdev2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", mdname(mddev), rdev->bdev, rdev2->bdev); warned = 1; } } if (warned) pr_warn("True protection against single-disk failure might be compromised.\n"); } mddev->recovery = 0; /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->dev_sectors; mddev->ok_start_degraded = start_dirty_degraded; if (start_readonly && md_is_rdwr(mddev)) mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ err = pers->run(mddev); if (err) pr_warn("md: pers->run() failed ...\n"); else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { WARN_ONCE(!mddev->external_size, "%s: default size too small, but 'external_size' not in effect?\n", __func__); pr_warn("md: invalid array_size %llu > default size %llu\n", (unsigned long long)mddev->array_sectors / 2, (unsigned long long)pers->size(mddev, 0, 0) / 2); err = -EINVAL; } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { struct bitmap *bitmap; bitmap = md_bitmap_create(mddev, -1); if (IS_ERR(bitmap)) { err = PTR_ERR(bitmap); pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); } else mddev->bitmap = bitmap; } if (err) goto bitmap_abort; if (mddev->bitmap_info.max_write_behind > 0) { bool create_pool = false; rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags) && rdev_init_serial(rdev)) create_pool = true; } if (create_pool && mddev->serial_info_pool == NULL) { mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); if (!mddev->serial_info_pool) { err = -ENOMEM; goto bitmap_abort; } } } if (mddev->queue) { bool nonrot = true; rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { nonrot = false; break; } } if (mddev->degraded) nonrot = false; if (nonrot) blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); else blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); /* Set the NOWAIT flags if all underlying devices support it */ if (nowait) blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); } if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } else if (mddev->ro == MD_AUTO_READ) mddev->ro = MD_RDWR; atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; if (mddev_is_clustered(mddev)) mddev->safemode_delay = 0; else mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; mddev->in_sync = 1; smp_wmb(); spin_lock(&mddev->lock); mddev->pers = pers; spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) sysfs_link_rdev(mddev, rdev); /* failure here is OK */ if (mddev->degraded && md_is_rdwr(mddev)) /* This ensures that recovering status is reported immediately * via sysfs - until a lack of spares is confirmed. */ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); if (mddev->sb_flags) md_update_sb(mddev, 0); md_new_event(); return 0; bitmap_abort: mddev_detach(mddev); if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; module_put(pers->owner); md_bitmap_destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); return err; } EXPORT_SYMBOL_GPL(md_run); int do_md_run(struct mddev *mddev) { int err; set_bit(MD_NOT_READY, &mddev->flags); err = md_run(mddev); if (err) goto out; err = md_bitmap_load(mddev); if (err) { md_bitmap_destroy(mddev); goto out; } if (mddev_is_clustered(mddev)) md_allow_write(mddev); /* run start up tasks that require md_thread */ md_start(mddev); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); clear_bit(MD_NOT_READY, &mddev->flags); mddev->changed = 1; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_action); sysfs_notify_dirent_safe(mddev->sysfs_degraded); out: clear_bit(MD_NOT_READY, &mddev->flags); return err; } int md_start(struct mddev *mddev) { int ret = 0; if (mddev->pers->start) { set_bit(MD_RECOVERY_WAIT, &mddev->recovery); md_wakeup_thread(mddev->thread); ret = mddev->pers->start(mddev); clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); md_wakeup_thread(mddev->sync_thread); } return ret; } EXPORT_SYMBOL_GPL(md_start); static int restart_array(struct mddev *mddev) { struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; bool has_journal = false; bool has_readonly = false; /* Complain if it has no devices */ if (list_empty(&mddev->disks)) return -ENXIO; if (!mddev->pers) return -EINVAL; if (md_is_rdwr(mddev)) return -EBUSY; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { if (test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) has_journal = true; if (rdev_read_only(rdev)) has_readonly = true; } rcu_read_unlock(); if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) /* Don't restart rw with journal missing/faulty */ return -EINVAL; if (has_readonly) return -EROFS; mddev->safemode = 0; mddev->ro = MD_RDWR; set_disk_ro(disk, 0); pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); /* Kick recovery or resync if necessary */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } static void md_clean(struct mddev *mddev) { mddev->array_sectors = 0; mddev->external_size = 0; mddev->dev_sectors = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; /* we still need mddev->external in export_rdev, do not clear it yet */ mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; mddev->flags = 0; mddev->sb_flags = 0; mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; mddev->layout = 0; mddev->max_disks = 0; mddev->events = 0; mddev->can_decrease_events = 0; mddev->delta_disks = 0; mddev->reshape_backwards = 0; mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; mddev->new_chunk_sectors = 0; mddev->curr_resync = MD_RESYNC_NONE; atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; mddev->in_sync = 0; mddev->changed = 0; mddev->degraded = 0; mddev->safemode = 0; mddev->private = NULL; mddev->cluster_info = NULL; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.default_space = 0; mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.max_write_behind = 0; mddev->bitmap_info.nodes = 0; } static void __md_stop_writes(struct mddev *mddev) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } del_timer_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } md_bitmap_flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || mddev->sb_flags)) { /* mark array as shutdown cleanly */ if (!mddev_is_clustered(mddev)) mddev->in_sync = 1; md_update_sb(mddev, 1); } /* disable policy to guarantee rdevs free resources for serialization */ mddev->serialize_policy = 0; mddev_destroy_serial_pool(mddev, NULL); } void md_stop_writes(struct mddev *mddev) { mddev_lock_nointr(mddev); __md_stop_writes(mddev); mddev_unlock(mddev); } EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { md_bitmap_wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } md_unregister_thread(mddev, &mddev->thread); if (mddev->queue) blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ } static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; md_bitmap_destroy(mddev); mddev_detach(mddev); /* Ensure ->event_work is done */ if (mddev->event_work.func) flush_workqueue(md_misc_wq); spin_lock(&mddev->lock); mddev->pers = NULL; spin_unlock(&mddev->lock); if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; if (pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); bioset_exit(&mddev->io_clone_set); } void md_stop(struct mddev *mddev) { lockdep_assert_held(&mddev->reconfig_mutex); /* stop the array and free an attached data structures. * This is called from dm-raid */ __md_stop_writes(mddev); __md_stop(mddev); } EXPORT_SYMBOL_GPL(md_stop); static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) { int err = 0; int did_freeze = 0; if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { did_freeze = 1; set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); md_wakeup_thread(mddev->thread); } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); /* * Thread might be blocked waiting for metadata update which will now * never happen */ md_wakeup_thread_directly(mddev->sync_thread); if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) return -EBUSY; mddev_unlock(mddev); wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { pr_warn("md: %s still in use.\n",mdname(mddev)); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } err = -EBUSY; goto out; } if (mddev->pers) { __md_stop_writes(mddev); err = -ENXIO; if (mddev->ro == MD_RDONLY) goto out; mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_state); err = 0; } out: mutex_unlock(&mddev->open_mutex); return err; } /* mode: * 0 - completely stop and dis-assemble array * 2 - stop but do not disassemble array */ static int do_md_stop(struct mddev *mddev, int mode, struct block_device *bdev) { struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; int did_freeze = 0; if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { did_freeze = 1; set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); md_wakeup_thread(mddev->thread); } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); /* * Thread might be blocked waiting for metadata update which will now * never happen */ md_wakeup_thread_directly(mddev->sync_thread); mddev_unlock(mddev); wait_event(resync_wait, (mddev->sync_thread == NULL && !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))); mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || mddev->sysfs_active || mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { pr_warn("md: %s still in use.\n",mdname(mddev)); mutex_unlock(&mddev->open_mutex); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } return -EBUSY; } if (mddev->pers) { if (!md_is_rdwr(mddev)) set_disk_ro(disk, 0); __md_stop_writes(mddev); __md_stop(mddev); /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) sysfs_unlink_rdev(mddev, rdev); set_capacity_and_notify(disk, 0); mutex_unlock(&mddev->open_mutex); mddev->changed = 1; if (!md_is_rdwr(mddev)) mddev->ro = MD_RDWR; } else mutex_unlock(&mddev->open_mutex); /* * Free resources if final stop */ if (mode == 0) { pr_info("md: %s stopped.\n", mdname(mddev)); if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; spin_lock(&mddev->lock); mddev->bitmap_info.file = NULL; spin_unlock(&mddev->lock); fput(f); } mddev->bitmap_info.offset = 0; export_array(mddev); md_clean(mddev); if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; } md_new_event(); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } #ifndef MODULE static void autorun_array(struct mddev *mddev) { struct md_rdev *rdev; int err; if (list_empty(&mddev->disks)) return; pr_info("md: running: "); rdev_for_each(rdev, mddev) { pr_cont("<%pg>", rdev->bdev); } pr_cont("\n"); err = do_md_run(mddev); if (err) { pr_warn("md: do_md_run() returned %d\n", err); do_md_stop(mddev, 0, NULL); } } /* * lets try to run arrays based on all disks that have arrived * until now. (those are in pending_raid_disks) * * the method: pick the first pending disk, collect all disks with * the same UUID, remove all from the pending list and put them into * the 'same_array' list. Then order this list based on superblock * update time (freshest comes first), kick out 'old' disks and * compare superblocks. If everything's fine then run it. * * If "unit" is allocated, then bump its reference count */ static void autorun_devices(int part) { struct md_rdev *rdev0, *rdev, *tmp; struct mddev *mddev; pr_info("md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { int unit; dev_t dev; LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, struct md_rdev, same_set); pr_debug("md: considering %pg ...\n", rdev0->bdev); INIT_LIST_HEAD(&candidates); rdev_for_each_list(rdev, tmp, &pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { pr_debug("md: adding %pg ...\n", rdev->bdev); list_move(&rdev->same_set, &candidates); } /* * now we have a set of devices, with all of them having * mostly sane superblocks. It's time to allocate the * mddev. */ if (part) { dev = MKDEV(mdp_major, rdev0->preferred_minor << MdpMinorShift); unit = MINOR(dev) >> MdpMinorShift; } else { dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); unit = MINOR(dev); } if (rdev0->preferred_minor != unit) { pr_warn("md: unit number in %pg is bad: %d\n", rdev0->bdev, rdev0->preferred_minor); break; } mddev = md_alloc(dev, NULL); if (IS_ERR(mddev)) break; if (mddev_suspend_and_lock(mddev)) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version || !list_empty(&mddev->disks)) { pr_warn("md: %s already running, cannot run %pg\n", mdname(mddev), rdev0->bdev); mddev_unlock_and_resume(mddev); } else { pr_debug("md: created %s\n", mdname(mddev)); mddev->persistent = 1; rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) export_rdev(rdev, mddev); } autorun_array(mddev); mddev_unlock_and_resume(mddev); } /* on success, candidates will be empty, on error * it won't... */ rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); export_rdev(rdev, mddev); } mddev_put(mddev); } pr_info("md: ... autorun DONE.\n"); } #endif /* !MODULE */ static int get_version(void __user *arg) { mdu_version_t ver; ver.major = MD_MAJOR_VERSION; ver.minor = MD_MINOR_VERSION; ver.patchlevel = MD_PATCHLEVEL_VERSION; if (copy_to_user(arg, &ver, sizeof(ver))) return -EFAULT; return 0; } static int get_array_info(struct mddev *mddev, void __user *arg) { mdu_array_info_t info; int nr,working,insync,failed,spare; struct md_rdev *rdev; nr = working = insync = failed = spare = 0; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; else { working++; if (test_bit(In_sync, &rdev->flags)) insync++; else if (test_bit(Journal, &rdev->flags)) /* TODO: add journal count to md_u.h */ ; else spare++; } } rcu_read_unlock(); info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; info.patch_version = MD_PATCHLEVEL_VERSION; info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); info.level = mddev->level; info.size = mddev->dev_sectors / 2; if (info.size != mddev->dev_sectors / 2) /* overflow */ info.size = -1; info.nr_disks = nr; info.raid_disks = mddev->raid_disks; info.md_minor = mddev->md_minor; info.not_persistent= !mddev->persistent; info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); info.state = 0; if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); if (mddev->bitmap && mddev->bitmap_info.offset) info.state |= (1<<MD_SB_BITMAP_PRESENT); if (mddev_is_clustered(mddev)) info.state |= (1<<MD_SB_CLUSTERED); info.active_disks = insync; info.working_disks = working; info.failed_disks = failed; info.spare_disks = spare; info.layout = mddev->layout; info.chunk_size = mddev->chunk_sectors << 9; if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; return 0; } static int get_bitmap_file(struct mddev *mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ char *ptr; int err; file = kzalloc(sizeof(*file), GFP_NOIO); if (!file) return -ENOMEM; err = 0; spin_lock(&mddev->lock); /* bitmap enabled */ if (mddev->bitmap_info.file) { ptr = file_path(mddev->bitmap_info.file, file->pathname, sizeof(file->pathname)); if (IS_ERR(ptr)) err = PTR_ERR(ptr); else memmove(file->pathname, ptr, sizeof(file->pathname)-(ptr-file->pathname)); } spin_unlock(&mddev->lock); if (err == 0 && copy_to_user(arg, file, sizeof(*file))) err = -EFAULT; kfree(file); return err; } static int get_disk_info(struct mddev *mddev, void __user * arg) { mdu_disk_info_t info; struct md_rdev *rdev; if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; rcu_read_lock(); rdev = md_find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); info.raid_disk = rdev->raid_disk; info.state = 0; if (test_bit(Faulty, &rdev->flags)) info.state |= (1<<MD_DISK_FAULTY); else if (test_bit(In_sync, &rdev->flags)) { info.state |= (1<<MD_DISK_ACTIVE); info.state |= (1<<MD_DISK_SYNC); } if (test_bit(Journal, &rdev->flags)) info.state |= (1<<MD_DISK_JOURNAL); if (test_bit(WriteMostly, &rdev->flags)) info.state |= (1<<MD_DISK_WRITEMOSTLY); if (test_bit(FailFast, &rdev->flags)) info.state |= (1<<MD_DISK_FAILFAST); } else { info.major = info.minor = 0; info.raid_disk = -1; info.state = (1<<MD_DISK_REMOVED); } rcu_read_unlock(); if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; return 0; } int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) { struct md_rdev *rdev; dev_t dev = MKDEV(info->major,info->minor); if (mddev_is_clustered(mddev) && !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { pr_warn("%s: Cannot add to clustered mddev.\n", mdname(mddev)); return -EINVAL; } if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) return -EOVERFLOW; if (!mddev->raid_disks) { int err; /* expecting a device which has a superblock */ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (IS_ERR(rdev)) { pr_warn("md: md_import_device returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } if (!list_empty(&mddev->disks)) { struct md_rdev *rdev0 = list_entry(mddev->disks.next, struct md_rdev, same_set); err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, rdev0->bdev); export_rdev(rdev, mddev); return -EINVAL; } } err = bind_rdev_to_array(rdev, mddev); if (err) export_rdev(rdev, mddev); return err; } /* * md_add_new_disk can be used once the array is assembled * to add "hot spares". They must already have a superblock * written */ if (mddev->pers) { int err; if (!mddev->pers->hot_add_disk) { pr_warn("%s: personality does not support diskops!\n", mdname(mddev)); return -EINVAL; } if (mddev->persistent) rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { pr_warn("md: md_import_device returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<<MD_DISK_SYNC) && info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. validate_super(mddev, rdev); if ((info->state & (1<<MD_DISK_SYNC)) && rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't * match, so reject it. */ export_rdev(rdev, mddev); return -EINVAL; } clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); else clear_bit(WriteMostly, &rdev->flags); if (info->state & (1<<MD_DISK_FAILFAST)) set_bit(FailFast, &rdev->flags); else clear_bit(FailFast, &rdev->flags); if (info->state & (1<<MD_DISK_JOURNAL)) { struct md_rdev *rdev2; bool has_journal = false; /* make sure no existing journal disk */ rdev_for_each(rdev2, mddev) { if (test_bit(Journal, &rdev2->flags)) { has_journal = true; break; } } if (has_journal || mddev->bitmap) { export_rdev(rdev, mddev); return -EBUSY; } set_bit(Journal, &rdev->flags); } /* * check whether the device shows up in other nodes */ if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) set_bit(Candidate, &rdev->flags); else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ err = md_cluster_ops->add_new_disk(mddev, rdev); if (err) { export_rdev(rdev, mddev); return err; } } } rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) export_rdev(rdev, mddev); if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { if (!err) { err = md_cluster_ops->new_disk_ack(mddev, err == 0); if (err) md_kick_rdev_from_array(rdev); } } else { if (err) md_cluster_ops->add_new_disk_cancel(mddev); else err = add_bound_rdev(rdev); } } else if (!err) err = add_bound_rdev(rdev); return err; } /* otherwise, md_add_new_disk is only allowed * for major_version==0 superblocks */ if (mddev->major_version != 0) { pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); return -EINVAL; } if (!(info->state & (1<<MD_DISK_FAULTY))) { int err; rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { pr_warn("md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } rdev->desc_nr = info->number; if (info->raid_disk < mddev->raid_disks) rdev->raid_disk = info->raid_disk; else rdev->raid_disk = -1; if (rdev->raid_disk < mddev->raid_disks) if (info->state & (1<<MD_DISK_SYNC)) set_bit(In_sync, &rdev->flags); if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); if (info->state & (1<<MD_DISK_FAILFAST)) set_bit(FailFast, &rdev->flags); if (!mddev->persistent) { pr_debug("md: nonpersistent superblock ...\n"); rdev->sb_start = bdev_nr_sectors(rdev->bdev); } else rdev->sb_start = calc_dev_sboffset(rdev); rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); if (err) { export_rdev(rdev, mddev); return err; } } return 0; } static int hot_remove_disk(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; if (!mddev->pers) return -ENODEV; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; if (rdev->raid_disk < 0) goto kick_rdev; clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); if (rdev->raid_disk >= 0) goto busy; kick_rdev: if (mddev_is_clustered(mddev)) { if (md_cluster_ops->remove_disk(mddev, rdev)) goto busy; } md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (mddev->thread) md_wakeup_thread(mddev->thread); else md_update_sb(mddev, 1); md_new_event(); return 0; busy: pr_debug("md: cannot remove active disk %pg from %s ...\n", rdev->bdev, mdname(mddev)); return -EBUSY; } static int hot_add_disk(struct mddev *mddev, dev_t dev) { int err; struct md_rdev *rdev; if (!mddev->pers) return -ENODEV; if (mddev->major_version != 0) { pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", mdname(mddev)); return -EINVAL; } if (!mddev->pers->hot_add_disk) { pr_warn("%s: personality does not support diskops!\n", mdname(mddev)); return -EINVAL; } rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { pr_warn("md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); return -EINVAL; } if (mddev->persistent) rdev->sb_start = calc_dev_sboffset(rdev); else rdev->sb_start = bdev_nr_sectors(rdev->bdev); rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { pr_warn("md: can not hot-add faulty %pg disk to %s!\n", rdev->bdev, mdname(mddev)); err = -EINVAL; goto abort_export; } clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) goto abort_export; /* * The rest should better be atomic, we can have disk failures * noticed in interrupt contexts ... */ rdev->raid_disk = -1; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread) md_update_sb(mddev, 1); /* * If the new disk does not support REQ_NOWAIT, * disable on the whole MD. */ if (!bdev_nowait(rdev->bdev)) { pr_info("%s: Disabling nowait because %pg does not support nowait\n", mdname(mddev), rdev->bdev); blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); } /* * Kick recovery, maybe this spare has to be added to the * array immediately. */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_new_event(); return 0; abort_export: export_rdev(rdev, mddev); return err; } static int set_bitmap_file(struct mddev *mddev, int fd) { int err = 0; if (mddev->pers) { if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; if (mddev->recovery || mddev->sync_thread) return -EBUSY; /* we should be able to change the bitmap.. */ } if (fd >= 0) { struct inode *inode; struct file *f; if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { pr_warn("%s: bitmap files not supported by this kernel\n", mdname(mddev)); return -EINVAL; } pr_warn("%s: using deprecated bitmap file support\n", mdname(mddev)); f = fget(fd); if (f == NULL) { pr_warn("%s: error: failed to get bitmap file\n", mdname(mddev)); return -EBADF; } inode = f->f_mapping->host; if (!S_ISREG(inode->i_mode)) { pr_warn("%s: error: bitmap file must be a regular file\n", mdname(mddev)); err = -EBADF; } else if (!(f->f_mode & FMODE_WRITE)) { pr_warn("%s: error: bitmap file must open for write\n", mdname(mddev)); err = -EBADF; } else if (atomic_read(&inode->i_writecount) != 1) { pr_warn("%s: error: bitmap file is already in use\n", mdname(mddev)); err = -EBUSY; } if (err) { fput(f); return err; } mddev->bitmap_info.file = f; mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { if (fd >= 0) { struct bitmap *bitmap; bitmap = md_bitmap_create(mddev, -1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = md_bitmap_load(mddev); } else err = PTR_ERR(bitmap); if (err) { md_bitmap_destroy(mddev); fd = -1; } } else if (fd < 0) { md_bitmap_destroy(mddev); } } if (fd < 0) { struct file *f = mddev->bitmap_info.file; if (f) { spin_lock(&mddev->lock); mddev->bitmap_info.file = NULL; spin_unlock(&mddev->lock); fput(f); } } return err; } /* * md_set_array_info is used two different ways * The original usage is when creating a new array. * In this usage, raid_disks is > 0 and it together with * level, size, not_persistent,layout,chunksize determine the * shape of the array. * This will always create an array with a type-0.90.0 superblock. * The newer usage is when assembling an array. * In this case raid_disks will be 0, and the major_version field is * use to determine which style super-blocks are to be found on the devices. * The minor and patch _version numbers are also kept incase the * super_block handler wishes to interpret them. */ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) { if (info->raid_disks == 0) { /* just setting version number for superblock loading */ if (info->major_version < 0 || info->major_version >= ARRAY_SIZE(super_types) || super_types[info->major_version].name == NULL) { /* maybe try to auto-load a module? */ pr_warn("md: superblock version %d not known\n", info->major_version); return -EINVAL; } mddev->major_version = info->major_version; mddev->minor_version = info->minor_version; mddev->patch_version = info->patch_version; mddev->persistent = !info->not_persistent; /* ensure mddev_put doesn't delete this now that there * is some minimal configuration. */ mddev->ctime = ktime_get_real_seconds(); return 0; } mddev->major_version = MD_MAJOR_VERSION; mddev->minor_version = MD_MINOR_VERSION; mddev->patch_version = MD_PATCHLEVEL_VERSION; mddev->ctime = ktime_get_real_seconds(); mddev->level = info->level; mddev->clevel[0] = 0; mddev->dev_sectors = 2 * (sector_t)info->size; mddev->raid_disks = info->raid_disks; /* don't set md_minor, it is determined by which /dev/md* was * openned */ if (info->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; else mddev->recovery_cp = 0; mddev->persistent = ! info->not_persistent; mddev->external = 0; mddev->layout = info->layout; if (mddev->level == 0) /* Cannot trust RAID0 layout info here */ mddev->layout = -1; mddev->chunk_sectors = info->chunk_size >> 9; if (mddev->persistent) { mddev->max_disks = MD_SB_DISKS; mddev->flags = 0; mddev->sb_flags = 0; } set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); mddev->bitmap_info.offset = 0; mddev->reshape_position = MaxSector; /* * Generate a 128 bit UUID */ get_random_bytes(mddev->uuid, 16); mddev->new_level = mddev->level; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->delta_disks = 0; mddev->reshape_backwards = 0; return 0; } void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->external_size) return; mddev->array_sectors = array_sectors; } EXPORT_SYMBOL(md_set_array_sectors); static int update_size(struct mddev *mddev, sector_t num_sectors) { struct md_rdev *rdev; int rv; int fit = (num_sectors == 0); sector_t old_dev_sectors = mddev->dev_sectors; if (mddev->pers->resize == NULL) return -EINVAL; /* The "num_sectors" is the number of sectors of each device that * is used. This can only make sense for arrays with redundancy. * linear and raid0 always use whatever space is available. We can only * consider changing this number if no resync or reconstruction is * happening, and if the new size is acceptable. It must fit before the * sb_start or, if that is <data_offset, it must fit before the size * of each device. If num_sectors is zero, we find the largest size * that fits. */ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->sync_thread) return -EBUSY; if (!md_is_rdwr(mddev)) return -EROFS; rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) num_sectors = avail; if (avail < num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); if (!rv) { if (mddev_is_clustered(mddev)) md_cluster_ops->update_size(mddev, old_dev_sectors); else if (mddev->queue) { set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); } } return rv; } static int update_raid_disks(struct mddev *mddev, int raid_disks) { int rv; struct md_rdev *rdev; /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; if (!md_is_rdwr(mddev)) return -EROFS; if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; if (mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || mddev->reshape_position != MaxSector) return -EBUSY; rdev_for_each(rdev, mddev) { if (mddev->raid_disks < raid_disks && rdev->data_offset < rdev->new_data_offset) return -EINVAL; if (mddev->raid_disks > raid_disks && rdev->data_offset > rdev->new_data_offset) return -EINVAL; } mddev->delta_disks = raid_disks - mddev->raid_disks; if (mddev->delta_disks < 0) mddev->reshape_backwards = 1; else if (mddev->delta_disks > 0) mddev->reshape_backwards = 0; rv = mddev->pers->check_reshape(mddev); if (rv < 0) { mddev->delta_disks = 0; mddev->reshape_backwards = 0; } return rv; } /* * update_array_info is used to change the configuration of an * on-line array. * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size * fields in the info are checked against the array. * Any differences that cannot be handled will cause an error. * Normally, only one change can be managed at a time. */ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) { int rv = 0; int cnt = 0; int state = 0; /* calculate expected state,ignoring low bits */ if (mddev->bitmap && mddev->bitmap_info.offset) state |= (1 << MD_SB_BITMAP_PRESENT); if (mddev->major_version != info->major_version || mddev->minor_version != info->minor_version || /* mddev->patch_version != info->patch_version || */ mddev->ctime != info->ctime || mddev->level != info->level || /* mddev->layout != info->layout || */ mddev->persistent != !info->not_persistent || mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ ((state^info->state) & 0xfffffe00) ) return -EINVAL; /* Check there is only one change */ if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) cnt++; if (mddev->raid_disks != info->raid_disks) cnt++; if (mddev->layout != info->layout) cnt++; if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; if (cnt == 0) return 0; if (cnt > 1) return -EINVAL; if (mddev->layout != info->layout) { /* Change layout * we don't need to do anything at the md level, the * personality will take care of it all. */ if (mddev->pers->check_reshape == NULL) return -EINVAL; else { mddev->new_layout = info->layout; rv = mddev->pers->check_reshape(mddev); if (rv) mddev->new_layout = mddev->layout; return rv; } } if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); if (mddev->raid_disks != info->raid_disks) rv = update_raid_disks(mddev, info->raid_disks); if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { rv = -EINVAL; goto err; } if (mddev->recovery || mddev->sync_thread) { rv = -EBUSY; goto err; } if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { struct bitmap *bitmap; /* add the bitmap */ if (mddev->bitmap) { rv = -EEXIST; goto err; } if (mddev->bitmap_info.default_offset == 0) { rv = -EINVAL; goto err; } mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; bitmap = md_bitmap_create(mddev, -1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = md_bitmap_load(mddev); } else rv = PTR_ERR(bitmap); if (rv) md_bitmap_destroy(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { rv = -ENOENT; goto err; } if (mddev->bitmap->storage.file) { rv = -EINVAL; goto err; } if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); rv = -EPERM; md_cluster_ops->unlock_all_bitmaps(mddev); goto err; } mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); module_put(md_cluster_mod); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; } } md_update_sb(mddev, 1); return rv; err: return rv; } static int set_disk_faulty(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; int err = 0; if (mddev->pers == NULL) return -ENODEV; rcu_read_lock(); rdev = md_find_rdev_rcu(mddev, dev); if (!rdev) err = -ENODEV; else { md_error(mddev, rdev); if (test_bit(MD_BROKEN, &mddev->flags)) err = -EBUSY; } rcu_read_unlock(); return err; } /* * We have a problem here : there is no easy way to give a CHS * virtual geometry. We currently pretend that we have a 2 heads * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct mddev *mddev = bdev->bd_disk->private_data; geo->heads = 2; geo->sectors = 4; geo->cylinders = mddev->array_sectors / 8; return 0; } static inline bool md_ioctl_valid(unsigned int cmd) { switch (cmd) { case ADD_NEW_DISK: case GET_ARRAY_INFO: case GET_BITMAP_FILE: case GET_DISK_INFO: case HOT_ADD_DISK: case HOT_REMOVE_DISK: case RAID_VERSION: case RESTART_ARRAY_RW: case RUN_ARRAY: case SET_ARRAY_INFO: case SET_BITMAP_FILE: case SET_DISK_FAULTY: case STOP_ARRAY: case STOP_ARRAY_RO: case CLUSTERED_DISK_NACK: return true; default: return false; } } static bool md_ioctl_need_suspend(unsigned int cmd) { switch (cmd) { case ADD_NEW_DISK: case HOT_ADD_DISK: case HOT_REMOVE_DISK: case SET_BITMAP_FILE: case SET_ARRAY_INFO: return true; default: return false; } } static int __md_set_array_info(struct mddev *mddev, void __user *argp) { mdu_array_info_t info; int err; if (!argp) memset(&info, 0, sizeof(info)); else if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; if (mddev->pers) { err = update_array_info(mddev, &info); if (err) pr_warn("md: couldn't update array info. %d\n", err); return err; } if (!list_empty(&mddev->disks)) { pr_warn("md: array %s already has disks!\n", mdname(mddev)); return -EBUSY; } if (mddev->raid_disks) { pr_warn("md: array %s already initialised!\n", mdname(mddev)); return -EBUSY; } err = md_set_array_info(mddev, &info); if (err) pr_warn("md: couldn't set array info. %d\n", err); return err; } static int md_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { int err = 0; void __user *argp = (void __user *)arg; struct mddev *mddev = NULL; bool did_set_md_closing = false; if (!md_ioctl_valid(cmd)) return -ENOTTY; switch (cmd) { case RAID_VERSION: case GET_ARRAY_INFO: case GET_DISK_INFO: break; default: if (!capable(CAP_SYS_ADMIN)) return -EACCES; } /* * Commands dealing with the RAID driver but not any * particular array: */ switch (cmd) { case RAID_VERSION: err = get_version(argp); goto out; default:; } /* * Commands creating/starting a new array: */ mddev = bdev->bd_disk->private_data; if (!mddev) { BUG(); goto out; } /* Some actions do not requires the mutex */ switch (cmd) { case GET_ARRAY_INFO: if (!mddev->raid_disks && !mddev->external) err = -ENODEV; else err = get_array_info(mddev, argp); goto out; case GET_DISK_INFO: if (!mddev->raid_disks && !mddev->external) err = -ENODEV; else err = get_disk_info(mddev, argp); goto out; case SET_DISK_FAULTY: err = set_disk_faulty(mddev, new_decode_dev(arg)); goto out; case GET_BITMAP_FILE: err = get_bitmap_file(mddev, argp); goto out; } if (cmd == HOT_REMOVE_DISK) /* need to ensure recovery thread has run */ wait_event_interruptible_timeout(mddev->sb_wait, !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery), msecs_to_jiffies(5000)); if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { /* Need to flush page cache, and ensure no-one else opens * and writes */ mutex_lock(&mddev->open_mutex); if (mddev->pers && atomic_read(&mddev->openers) > 1) { mutex_unlock(&mddev->open_mutex); err = -EBUSY; goto out; } if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { mutex_unlock(&mddev->open_mutex); err = -EBUSY; goto out; } did_set_md_closing = true; mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); } if (!md_is_rdwr(mddev)) flush_work(&mddev->sync_work); err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); if (err) { pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", err, cmd); goto out; } if (cmd == SET_ARRAY_INFO) { err = __md_set_array_info(mddev, argp); goto unlock; } /* * Commands querying/configuring an existing array: */ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ if ((!mddev->raid_disks && !mddev->external) && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE && cmd != GET_BITMAP_FILE) { err = -ENODEV; goto unlock; } /* * Commands even a read-only array can execute: */ switch (cmd) { case RESTART_ARRAY_RW: err = restart_array(mddev); goto unlock; case STOP_ARRAY: err = do_md_stop(mddev, 0, bdev); goto unlock; case STOP_ARRAY_RO: err = md_set_readonly(mddev, bdev); goto unlock; case HOT_REMOVE_DISK: err = hot_remove_disk(mddev, new_decode_dev(arg)); goto unlock; case ADD_NEW_DISK: /* We can support ADD_NEW_DISK on read-only arrays * only if we are re-adding a preexisting device. * So require mddev->pers and MD_DISK_SYNC. */ if (mddev->pers) { mdu_disk_info_t info; if (copy_from_user(&info, argp, sizeof(info))) err = -EFAULT; else if (!(info.state & (1<<MD_DISK_SYNC))) /* Need to clear read-only for this */ break; else err = md_add_new_disk(mddev, &info); goto unlock; } break; } /* * The remaining ioctls are changing the state of the * superblock, so we do not allow them on read-only arrays. */ if (!md_is_rdwr(mddev) && mddev->pers) { if (mddev->ro != MD_AUTO_READ) { err = -EROFS; goto unlock; } mddev->ro = MD_RDWR; sysfs_notify_dirent_safe(mddev->sysfs_state); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); /* mddev_unlock will wake thread */ /* If a device failed while we were read-only, we * need to make sure the metadata is updated now. */ if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { mddev_unlock(mddev); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); } } switch (cmd) { case ADD_NEW_DISK: { mdu_disk_info_t info; if (copy_from_user(&info, argp, sizeof(info))) err = -EFAULT; else err = md_add_new_disk(mddev, &info); goto unlock; } case CLUSTERED_DISK_NACK: if (mddev_is_clustered(mddev)) md_cluster_ops->new_disk_ack(mddev, false); else err = -EINVAL; goto unlock; case HOT_ADD_DISK: err = hot_add_disk(mddev, new_decode_dev(arg)); goto unlock; case RUN_ARRAY: err = do_md_run(mddev); goto unlock; case SET_BITMAP_FILE: err = set_bitmap_file(mddev, (int)arg); goto unlock; default: err = -EINVAL; goto unlock; } unlock: if (mddev->hold_active == UNTIL_IOCTL && err != -EINVAL) mddev->hold_active = 0; md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); out: if(did_set_md_closing) clear_bit(MD_CLOSING, &mddev->flags); return err; } #ifdef CONFIG_COMPAT static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { switch (cmd) { case HOT_REMOVE_DISK: case HOT_ADD_DISK: case SET_DISK_FAULTY: case SET_BITMAP_FILE: /* These take in integer arg, do not convert */ break; default: arg = (unsigned long)compat_ptr(arg); break; } return md_ioctl(bdev, mode, cmd, arg); } #endif /* CONFIG_COMPAT */ static int md_set_read_only(struct block_device *bdev, bool ro) { struct mddev *mddev = bdev->bd_disk->private_data; int err; err = mddev_lock(mddev); if (err) return err; if (!mddev->raid_disks && !mddev->external) { err = -ENODEV; goto out_unlock; } /* * Transitioning to read-auto need only happen for arrays that call * md_write_start and which are not ready for writes yet. */ if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { err = restart_array(mddev); if (err) goto out_unlock; mddev->ro = MD_AUTO_READ; } out_unlock: mddev_unlock(mddev); return err; } static int md_open(struct gendisk *disk, blk_mode_t mode) { struct mddev *mddev; int err; spin_lock(&all_mddevs_lock); mddev = mddev_get(disk->private_data); spin_unlock(&all_mddevs_lock); if (!mddev) return -ENODEV; err = mutex_lock_interruptible(&mddev->open_mutex); if (err) goto out; err = -ENODEV; if (test_bit(MD_CLOSING, &mddev->flags)) goto out_unlock; atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); disk_check_media_change(disk); return 0; out_unlock: mutex_unlock(&mddev->open_mutex); out: mddev_put(mddev); return err; } static void md_release(struct gendisk *disk) { struct mddev *mddev = disk->private_data; BUG_ON(!mddev); atomic_dec(&mddev->openers); mddev_put(mddev); } static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) { struct mddev *mddev = disk->private_data; unsigned int ret = 0; if (mddev->changed) ret = DISK_EVENT_MEDIA_CHANGE; mddev->changed = 0; return ret; } static void md_free_disk(struct gendisk *disk) { struct mddev *mddev = disk->private_data; mddev_free(mddev); } const struct block_device_operations md_fops = { .owner = THIS_MODULE, .submit_bio = md_submit_bio, .open = md_open, .release = md_release, .ioctl = md_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = md_compat_ioctl, #endif .getgeo = md_getgeo, .check_events = md_check_events, .set_read_only = md_set_read_only, .free_disk = md_free_disk, }; static int md_thread(void *arg) { struct md_thread *thread = arg; /* * md_thread is a 'system-thread', it's priority should be very * high. We avoid resource deadlocks individually in each * raid personality. (RAID5 does preallocation) We also use RR and * the very same RT priority as kswapd, thus we will never get * into a priority inversion deadlock. * * we definitely have to have equal or higher priority than * bdflush, otherwise bdflush will deadlock if there are too * many dirty RAID5 blocks. */ allow_signal(SIGKILL); while (!kthread_should_stop()) { /* We need to wait INTERRUPTIBLE so that * we don't add to the load-average. * That means we need to be sure no signals are * pending */ if (signal_pending(current)) flush_signals(current); wait_event_interruptible_timeout (thread->wqueue, test_bit(THREAD_WAKEUP, &thread->flags) || kthread_should_stop() || kthread_should_park(), thread->timeout); clear_bit(THREAD_WAKEUP, &thread->flags); if (kthread_should_park()) kthread_parkme(); if (!kthread_should_stop()) thread->run(thread); } return 0; } static void md_wakeup_thread_directly(struct md_thread __rcu *thread) { struct md_thread *t; rcu_read_lock(); t = rcu_dereference(thread); if (t) wake_up_process(t->tsk); rcu_read_unlock(); } void md_wakeup_thread(struct md_thread __rcu *thread) { struct md_thread *t; rcu_read_lock(); t = rcu_dereference(thread); if (t) { pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); set_bit(THREAD_WAKEUP, &t->flags); wake_up(&t->wqueue); } rcu_read_unlock(); } EXPORT_SYMBOL(md_wakeup_thread); struct md_thread *md_register_thread(void (*run) (struct md_thread *), struct mddev *mddev, const char *name) { struct md_thread *thread; thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); if (!thread) return NULL; init_waitqueue_head(&thread->wqueue); thread->run = run; thread->mddev = mddev; thread->timeout = MAX_SCHEDULE_TIMEOUT; thread->tsk = kthread_run(md_thread, thread, "%s_%s", mdname(thread->mddev), name); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; } return thread; } EXPORT_SYMBOL(md_register_thread); void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) { struct md_thread *thread = rcu_dereference_protected(*threadp, lockdep_is_held(&mddev->reconfig_mutex)); if (!thread) return; rcu_assign_pointer(*threadp, NULL); synchronize_rcu(); pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); kfree(thread); } EXPORT_SYMBOL(md_unregister_thread); void md_error(struct mddev *mddev, struct md_rdev *rdev) { if (!rdev || test_bit(Faulty, &rdev->flags)) return; if (!mddev->pers || !mddev->pers->error_handler) return; mddev->pers->error_handler(mddev, rdev); if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); if (!test_bit(MD_BROKEN, &mddev->flags)) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); md_new_event(); } EXPORT_SYMBOL(md_error); /* seq_file implementation /proc/mdstat */ static void status_unused(struct seq_file *seq) { int i = 0; struct md_rdev *rdev; seq_printf(seq, "unused devices: "); list_for_each_entry(rdev, &pending_raid_disks, same_set) { i++; seq_printf(seq, "%pg ", rdev->bdev); } if (!i) seq_printf(seq, "<none>"); seq_printf(seq, "\n"); } static int status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; unsigned long dt, db = 0; sector_t rt, curr_mark_cnt, resync_mark_cnt; int scale, recovery_active; unsigned int per_milli; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; resync = mddev->curr_resync; if (resync < MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) /* Still cleaning up */ resync = max_sectors; } else if (resync > max_sectors) { resync = max_sectors; } else { res = atomic_read(&mddev->recovery_active); /* * Resync has started, but the subtraction has overflowed or * yielded one of the special values. Force it to active to * ensure the status reports an active resync. */ if (resync < res || resync - res < MD_RESYNC_ACTIVE) resync = MD_RESYNC_ACTIVE; else resync -= res; } if (resync == MD_RESYNC_NONE) { if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && rdev->recovery_offset != MaxSector && rdev->recovery_offset) { seq_printf(seq, "\trecover=REMOTE"); return 1; } if (mddev->reshape_position != MaxSector) seq_printf(seq, "\treshape=REMOTE"); else seq_printf(seq, "\tresync=REMOTE"); return 1; } if (mddev->recovery_cp < MaxSector) { seq_printf(seq, "\tresync=PENDING"); return 1; } return 0; } if (resync < MD_RESYNC_ACTIVE) { seq_printf(seq, "\tresync=DELAYED"); return 1; } WARN_ON(max_sectors == 0); /* Pick 'scale' such that (resync>>scale)*1000 will fit * in a sector_t, and (max_sectors>>scale) will fit in a * u32, as those are the requirements for sector_div. * Thus 'scale' must be at least 10 */ scale = 10; if (sizeof(sector_t) > sizeof(unsigned long)) { while ( max_sectors/2 > (1ULL<<(scale+32))) scale++; } res = (resync>>scale)*1000; sector_div(res, (u32)((max_sectors>>scale)+1)); per_milli = res; { int i, x = per_milli/50, y = 20-x; seq_printf(seq, "["); for (i = 0; i < x; i++) seq_printf(seq, "="); seq_printf(seq, ">"); for (i = 0; i < y; i++) seq_printf(seq, "."); seq_printf(seq, "] "); } seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? "reshape" : (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? "check" : (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"))), per_milli/10, per_milli % 10, (unsigned long long) resync/2, (unsigned long long) max_sectors/2); /* * dt: time from mark until now * db: blocks written from mark until now * rt: remaining time * * rt is a sector_t, which is always 64bit now. We are keeping * the original algorithm, but it is not really necessary. * * Original algorithm: * So we divide before multiply in case it is 32bit and close * to the limit. * We scale the divisor (db) by 32 to avoid losing precision * near the end of resync when the number of remaining sectors * is close to 'db'. * We then divide rt by 32 after multiplying by db to compensate. * The '+1' avoids division by zero if db is very small. */ dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; curr_mark_cnt = mddev->curr_mark_cnt; recovery_active = atomic_read(&mddev->recovery_active); resync_mark_cnt = mddev->resync_mark_cnt; if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) db = curr_mark_cnt - (recovery_active + resync_mark_cnt); rt = max_sectors - resync; /* number of remaining sectors */ rt = div64_u64(rt, db/32+1); rt *= dt; rt >>= 5; seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, ((unsigned long)rt % 60)/6); seq_printf(seq, " speed=%ldK/sec", db/2/dt); return 1; } static void *md_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&all_mddevs_lock) { struct md_personality *pers; seq_puts(seq, "Personalities : "); spin_lock(&pers_lock); list_for_each_entry(pers, &pers_list, list) seq_printf(seq, "[%s] ", pers->name); spin_unlock(&pers_lock); seq_puts(seq, "\n"); seq->poll_event = atomic_read(&md_event_count); spin_lock(&all_mddevs_lock); return seq_list_start(&all_mddevs, *pos); } static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &all_mddevs, pos); } static void md_seq_stop(struct seq_file *seq, void *v) __releases(&all_mddevs_lock) { status_unused(seq); spin_unlock(&all_mddevs_lock); } static int md_seq_show(struct seq_file *seq, void *v) { struct mddev *mddev = list_entry(v, struct mddev, all_mddevs); sector_t sectors; struct md_rdev *rdev; if (!mddev_get(mddev)) return 0; spin_unlock(&all_mddevs_lock); spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), mddev->pers ? "" : "in"); if (mddev->pers) { if (mddev->ro == MD_RDONLY) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); seq_printf(seq, " %s", mddev->pers->name); } sectors = 0; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); if (test_bit(WriteMostly, &rdev->flags)) seq_printf(seq, "(W)"); if (test_bit(Journal, &rdev->flags)) seq_printf(seq, "(J)"); if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; } if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ if (test_bit(Replacement, &rdev->flags)) seq_printf(seq, "(R)"); sectors += rdev->sectors; } rcu_read_unlock(); if (!list_empty(&mddev->disks)) { if (mddev->pers) seq_printf(seq, "\n %llu blocks", (unsigned long long) mddev->array_sectors / 2); else seq_printf(seq, "\n %llu blocks", (unsigned long long)sectors / 2); } if (mddev->persistent) { if (mddev->major_version != 0 || mddev->minor_version != 90) { seq_printf(seq," super %d.%d", mddev->major_version, mddev->minor_version); } } else if (mddev->external) seq_printf(seq, " super external:%s", mddev->metadata_type); else seq_printf(seq, " super non-persistent"); if (mddev->pers) { mddev->pers->status(seq, mddev); seq_printf(seq, "\n "); if (mddev->pers->sync_request) { if (status_resync(seq, mddev)) seq_printf(seq, "\n "); } } else seq_printf(seq, "\n "); md_bitmap_status(seq, mddev->bitmap); seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); spin_lock(&all_mddevs_lock); if (atomic_dec_and_test(&mddev->active)) __mddev_put(mddev); return 0; } static const struct seq_operations md_seq_ops = { .start = md_seq_start, .next = md_seq_next, .stop = md_seq_stop, .show = md_seq_show, }; static int md_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; int error; error = seq_open(file, &md_seq_ops); if (error) return error; seq = file->private_data; seq->poll_event = atomic_read(&md_event_count); return error; } static int md_unloading; static __poll_t mdstat_poll(struct file *filp, poll_table *wait) { struct seq_file *seq = filp->private_data; __poll_t mask; if (md_unloading) return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; poll_wait(filp, &md_event_waiters, wait); /* always allow read */ mask = EPOLLIN | EPOLLRDNORM; if (seq->poll_event != atomic_read(&md_event_count)) mask |= EPOLLERR | EPOLLPRI; return mask; } static const struct proc_ops mdstat_proc_ops = { .proc_open = md_seq_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, .proc_poll = mdstat_poll, }; int register_md_personality(struct md_personality *p) { pr_debug("md: %s personality registered for level %d\n", p->name, p->level); spin_lock(&pers_lock); list_add_tail(&p->list, &pers_list); spin_unlock(&pers_lock); return 0; } EXPORT_SYMBOL(register_md_personality); int unregister_md_personality(struct md_personality *p) { pr_debug("md: %s personality unregistered\n", p->name); spin_lock(&pers_lock); list_del_init(&p->list); spin_unlock(&pers_lock); return 0; } EXPORT_SYMBOL(unregister_md_personality); int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module) { int ret = 0; spin_lock(&pers_lock); if (md_cluster_ops != NULL) ret = -EALREADY; else { md_cluster_ops = ops; md_cluster_mod = module; } spin_unlock(&pers_lock); return ret; } EXPORT_SYMBOL(register_md_cluster_operations); int unregister_md_cluster_operations(void) { spin_lock(&pers_lock); md_cluster_ops = NULL; spin_unlock(&pers_lock); return 0; } EXPORT_SYMBOL(unregister_md_cluster_operations); int md_setup_cluster(struct mddev *mddev, int nodes) { int ret; if (!md_cluster_ops) request_module("md-cluster"); spin_lock(&pers_lock); /* ensure module won't be unloaded */ if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { pr_warn("can't find md-cluster module or get its reference.\n"); spin_unlock(&pers_lock); return -ENOENT; } spin_unlock(&pers_lock); ret = md_cluster_ops->join(mddev, nodes); if (!ret) mddev->safemode_delay = 0; return ret; } void md_cluster_stop(struct mddev *mddev) { if (!md_cluster_ops) return; md_cluster_ops->leave(mddev); module_put(md_cluster_mod); } static int is_mddev_idle(struct mddev *mddev, int init) { struct md_rdev *rdev; int idle; int curr_events; idle = 1; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and * disk_stats is counted when it completes. * So resync activity will cause curr_events to be smaller than * when there was no such activity. * non-sync IO will cause disk_stat to increase without * increasing sync_io so curr_events will (eventually) * be larger than it was before. Once it becomes * substantially larger, the test below will cause * the array to appear non-idle, and resync will slow * down. * If there is a lot of outstanding resync activity when * we set last_event to curr_events, then all that activity * completing might cause the array to appear non-idle * and resync will be slowed down even though there might * not have been non-resync activity. This will only * happen once though. 'last_events' will soon reflect * the state where there is little or no outstanding * resync requests, and further resync activity will * always make curr_events less than last_events. * */ if (init || curr_events - rdev->last_events > 64) { rdev->last_events = curr_events; idle = 0; } } rcu_read_unlock(); return idle; } void md_done_sync(struct mddev *mddev, int blocks, int ok) { /* another "blocks" (512byte) blocks have been synced */ atomic_sub(blocks, &mddev->recovery_active); wake_up(&mddev->recovery_wait); if (!ok) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_ERROR, &mddev->recovery); md_wakeup_thread(mddev->thread); // stop recovery, signal do_sync .... } } EXPORT_SYMBOL(md_done_sync); /* md_write_start(mddev, bi) * If we need to update some array metadata (e.g. 'active' flag * in superblock) before writing, schedule a superblock update * and wait for it to complete. * A return value of 'false' means that the write wasn't recorded * and cannot proceed as the array is being suspend. */ bool md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) return true; BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); did_change = 1; } rcu_read_lock(); percpu_ref_get(&mddev->writes_pending); smp_mb(); /* Match smp_mb in set_in_sync() */ if (mddev->safemode == 1) mddev->safemode = 0; /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ if (mddev->in_sync || mddev->sync_checkers) { spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); md_wakeup_thread(mddev->thread); did_change = 1; } spin_unlock(&mddev->lock); } rcu_read_unlock(); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); if (!mddev->has_superblocks) return true; wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || is_md_suspended(mddev)); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { percpu_ref_put(&mddev->writes_pending); return false; } return true; } EXPORT_SYMBOL(md_write_start); /* md_write_inc can only be called when md_write_start() has * already been called at least once of the current request. * It increments the counter and is useful when a single request * is split into several parts. Each part causes an increment and * so needs a matching md_write_end(). * Unlike md_write_start(), it is safe to call md_write_inc() inside * a spinlocked region. */ void md_write_inc(struct mddev *mddev, struct bio *bi) { if (bio_data_dir(bi) != WRITE) return; WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); percpu_ref_get(&mddev->writes_pending); } EXPORT_SYMBOL(md_write_inc); void md_write_end(struct mddev *mddev) { percpu_ref_put(&mddev->writes_pending); if (mddev->safemode == 2) md_wakeup_thread(mddev->thread); else if (mddev->safemode_delay) /* The roundup() ensures this only performs locking once * every ->safemode_delay jiffies */ mod_timer(&mddev->safemode_timer, roundup(jiffies, mddev->safemode_delay) + mddev->safemode_delay); } EXPORT_SYMBOL(md_write_end); /* This is used by raid0 and raid10 */ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size) { struct bio *discard_bio = NULL; if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, &discard_bio) || !discard_bio) return; bio_chain(discard_bio, bio); bio_clone_blkg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(discard_bio, disk_devt(mddev->gendisk), bio->bi_iter.bi_sector); submit_bio_noacct(discard_bio); } EXPORT_SYMBOL_GPL(md_submit_discard_bio); static void md_end_clone_io(struct bio *bio) { struct md_io_clone *md_io_clone = bio->bi_private; struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; orig_bio->bi_status = bio->bi_status; if (md_io_clone->start_time) bio_end_io_acct(orig_bio, md_io_clone->start_time); bio_put(bio); bio_endio(orig_bio); percpu_ref_put(&mddev->active_io); } static void md_clone_bio(struct mddev *mddev, struct bio **bio) { struct block_device *bdev = (*bio)->bi_bdev; struct md_io_clone *md_io_clone; struct bio *clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); md_io_clone = container_of(clone, struct md_io_clone, bio_clone); md_io_clone->orig_bio = *bio; md_io_clone->mddev = mddev; if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = bio_start_io_acct(*bio); clone->bi_end_io = md_end_clone_io; clone->bi_private = md_io_clone; *bio = clone; } void md_account_bio(struct mddev *mddev, struct bio **bio) { percpu_ref_get(&mddev->active_io); md_clone_bio(mddev, bio); } EXPORT_SYMBOL_GPL(md_account_bio); /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. */ void md_allow_write(struct mddev *mddev) { if (!mddev->pers) return; if (!md_is_rdwr(mddev)) return; if (!mddev->pers->sync_request) return; spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); /* wait for the dirty state to be recorded in the metadata */ wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); } EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) void md_do_sync(struct md_thread *thread) { struct mddev *mddev = thread->mddev; struct mddev *mddev2; unsigned int currspeed = 0, window; sector_t max_sectors,j, io_sectors, recovery_done; unsigned long mark[SYNC_MARKS]; unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; sector_t last_check; int skipped = 0; struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; int ret; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) return; if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); return; } if (mddev_is_clustered(mddev)) { ret = md_cluster_ops->resync_start(mddev); if (ret) goto skip; set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) && ((unsigned long long)mddev->curr_resync_completed < (unsigned long long)mddev->resync_max_sectors)) goto skip; } if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { desc = "data-check"; action = "check"; } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { desc = "requested-resync"; action = "repair"; } else desc = "resync"; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) desc = "reshape"; else desc = "recovery"; mddev->last_sync_action = action ?: desc; /* * Before starting a resync we must have set curr_resync to * 2, and then checked that every "conflicting" array has curr_resync * less than ours. When we find one that is the same or higher * we wait on resync_wait. To avoid deadlock, we reduce curr_resync * to 1 if we choose to yield (based arbitrarily on address of mddev structure). * This will mean we have to start checking from the beginning again. * */ do { int mddev2_minor = -1; mddev->curr_resync = MD_RESYNC_DELAYED; try_again: if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { if (test_bit(MD_DELETED, &mddev2->flags)) continue; if (mddev2 == mddev) continue; if (!mddev->parallel_resync && mddev2->curr_resync && match_mddev_units(mddev, mddev2)) { DEFINE_WAIT(wq); if (mddev < mddev2 && mddev->curr_resync == MD_RESYNC_DELAYED) { /* arbitrarily yield */ mddev->curr_resync = MD_RESYNC_YIELDED; wake_up(&resync_wait); } if (mddev > mddev2 && mddev->curr_resync == MD_RESYNC_YIELDED) /* no need to wait here, we can wait the next * time 'round when curr_resync == 2 */ continue; /* We need to wait 'interruptible' so as not to * contribute to the load average, and not to * be caught by 'softlockup' */ prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { if (mddev2_minor != mddev2->md_minor) { mddev2_minor = mddev2->md_minor; pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", desc, mdname(mddev), mdname(mddev2)); } spin_unlock(&all_mddevs_lock); if (signal_pending(current)) flush_signals(current); schedule(); finish_wait(&resync_wait, &wq); goto try_again; } finish_wait(&resync_wait, &wq); } } spin_unlock(&all_mddevs_lock); } while (mddev->curr_resync < MD_RESYNC_DELAYED); j = 0; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* resync follows the size requested by the personality, * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; else if (!mddev->bitmap) j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { max_sectors = mddev->resync_max_sectors; /* * If the original node aborts reshaping then we continue the * reshaping, so set j again to avoid restart reshape from the * first beginning */ if (mddev_is_clustered(mddev) && mddev->reshape_position != MaxSector) j = mddev->reshape_position; } else { /* recovery follows the physical size of devices */ max_sectors = mddev->dev_sectors; j = MaxSector; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) j = rdev->recovery_offset; rcu_read_unlock(); /* If there is a bitmap, we need to make sure all * writes that started before we added a spare * complete before we start doing a recovery. * Otherwise the write might complete and (via * bitmap_endwrite) set a bit in the bitmap after the * recovery has checked that bit and skipped that * region. */ if (mddev->bitmap) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } } pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", speed_max(mddev), desc); is_mddev_idle(mddev, 1); /* this initializes IO event counters */ io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { mark[m] = jiffies; mark_cnt[m] = io_sectors; } last_mark = 0; mddev->resync_mark = mark[last_mark]; mddev->resync_mark_cnt = mark_cnt[last_mark]; /* * Tune reconstruction: */ window = 32 * (PAGE_SIZE / 512); pr_debug("md: using %dk window, over a total of %lluk.\n", window/2, (unsigned long long)max_sectors/2); atomic_set(&mddev->recovery_active, 0); last_check = 0; if (j >= MD_RESYNC_ACTIVE) { pr_debug("md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; } else mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ mddev->curr_resync_completed = j; sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(); update_time = jiffies; blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; skipped = 0; if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && ((mddev->curr_resync > mddev->curr_resync_completed && (mddev->curr_resync - mddev->curr_resync_completed) > (max_sectors >> 4)) || time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || (j - mddev->curr_resync_completed)*2 >= mddev->resync_max - mddev->curr_resync_completed || mddev->curr_resync_completed > mddev->resync_max )) { /* time to update curr_resync_completed */ wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && j > mddev->recovery_cp) mddev->recovery_cp = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_completed); } while (j >= mddev->resync_max && !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* As this condition is controlled by user-space, * we can block indefinitely, so use '_interruptible' * to avoid triggering warnings. */ flush_signals(current); /* just in case */ wait_event_interruptible(mddev->recovery_wait, mddev->resync_max > j || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); } if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; sectors = mddev->pers->sync_request(mddev, j, &skipped); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); break; } if (!skipped) { /* actual IO requested */ io_sectors += sectors; atomic_add(sectors, &mddev->recovery_active); } if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; j += sectors; if (j > max_sectors) /* when skipping, extra large numbers can be returned. */ j = max_sectors; if (j >= MD_RESYNC_ACTIVE) mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be * visible in /proc/mdstat */ md_new_event(); if (last_check + window > io_sectors || j == max_sectors) continue; last_check = io_sectors; repeat: if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { /* step marks */ int next = (last_mark+1) % SYNC_MARKS; mddev->resync_mark = mark[next]; mddev->resync_mark_cnt = mark_cnt[next]; mark[next] = jiffies; mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); last_mark = next; } if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; /* * this loop exits only if either when we are slower than * the 'hard' speed limit, or the system was IO-idle for * a jiffy. * the system might be non-idle CPU-wise, but we only care * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast) */ cond_resched(); recovery_done = io_sectors - atomic_read(&mddev->recovery_active); currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 /((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > speed_min(mddev)) { if (currspeed > speed_max(mddev)) { msleep(500); goto repeat; } if (!is_mddev_idle(mddev, 0)) { /* * Give other IO more of a chance. * The faster the devices, the less we wait. */ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); } } } pr_info("md: %s: %s %s.\n",mdname(mddev), desc, test_bit(MD_RECOVERY_INTR, &mddev->recovery) ? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop */ blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->curr_resync >= MD_RESYNC_ACTIVE) { mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { pr_debug("md: checkpointing %s of %s.\n", desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR, &mddev->recovery)) mddev->recovery_cp = mddev->curr_resync_completed; else mddev->recovery_cp = mddev->curr_resync; } } else mddev->recovery_cp = MaxSector; } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset = mddev->curr_resync; rcu_read_unlock(); } } } skip: /* set CHANGE_PENDING here since maybe another update is needed, * so other nodes are informed. It should be harmless for normal * raid */ set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->delta_disks > 0 && mddev->pers->finish_reshape && mddev->pers->size && mddev->queue) { mddev_lock_nointr(mddev); md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); mddev_unlock(mddev); if (!mddev_is_clustered(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); } spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = 0; mddev->resync_max = MaxSector; } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; set_bit(MD_RECOVERY_DONE, &mddev->recovery); mddev->curr_resync = MD_RESYNC_NONE; spin_unlock(&mddev->lock); wake_up(&resync_wait); md_wakeup_thread(mddev->thread); return; } EXPORT_SYMBOL_GPL(md_do_sync); static bool rdev_removeable(struct md_rdev *rdev) { /* rdev is not used. */ if (rdev->raid_disk < 0) return false; /* There are still inflight io, don't remove this rdev. */ if (atomic_read(&rdev->nr_pending)) return false; /* * An error occurred but has not yet been acknowledged by the metadata * handler, don't remove this rdev. */ if (test_bit(Blocked, &rdev->flags)) return false; /* Fautly rdev is not used, it's safe to remove it. */ if (test_bit(Faulty, &rdev->flags)) return true; /* Journal disk can only be removed if it's faulty. */ if (test_bit(Journal, &rdev->flags)) return false; /* * 'In_sync' is cleared while 'raid_disk' is valid, which means * replacement has just become active from pers->spare_active(), and * then pers->hot_remove_disk() will replace this rdev with replacement. */ if (!test_bit(In_sync, &rdev->flags)) return true; return false; } static bool rdev_is_spare(struct md_rdev *rdev) { return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags); } static bool rdev_addable(struct md_rdev *rdev) { /* rdev is already used, don't add it again. */ if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || test_bit(Faulty, &rdev->flags)) return false; /* Allow to add journal disk. */ if (test_bit(Journal, &rdev->flags)) return true; /* Allow to add if array is read-write. */ if (md_is_rdwr(rdev->mddev)) return true; /* * For read-only array, only allow to readd a rdev. And if bitmap is * used, don't allow to readd a rdev that is too old. */ if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) return true; return false; } static bool md_spares_need_change(struct mddev *mddev) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) if (rdev_removeable(rdev) || rdev_addable(rdev)) return true; return false; } static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this) { struct md_rdev *rdev; int spares = 0; int removed = 0; bool remove_some = false; if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) /* Mustn't remove devices when resync thread is running */ return 0; rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)==0) { /* Faulty non-Blocked devices with nr_pending == 0 * never get nr_pending incremented, * never get Faulty cleared, and never get Blocked set. * So we can synchronize_rcu now rather than once per device */ remove_some = true; set_bit(RemoveSynchronized, &rdev->flags); } } if (remove_some) synchronize_rcu(); rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && (test_bit(RemoveSynchronized, &rdev->flags) || rdev_removeable(rdev))) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); rdev->saved_raid_disk = rdev->raid_disk; rdev->raid_disk = -1; removed++; } } if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) clear_bit(RemoveSynchronized, &rdev->flags); } if (removed && mddev->kobj.sd) sysfs_notify_dirent_safe(mddev->sysfs_degraded); if (this && removed) goto no_add; rdev_for_each(rdev, mddev) { if (this && this != rdev) continue; if (rdev_is_spare(rdev)) spares++; if (!rdev_addable(rdev)) continue; if (!test_bit(Journal, &rdev->flags)) rdev->recovery_offset = 0; if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { /* failure here is OK */ sysfs_link_rdev(mddev, rdev); if (!test_bit(Journal, &rdev->flags)) spares++; md_new_event(); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } no_add: if (removed) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return spares; } static bool md_choose_sync_action(struct mddev *mddev, int *spares) { /* Check if reshape is in progress first. */ if (mddev->reshape_position != MaxSector) { if (mddev->pers->check_reshape == NULL || mddev->pers->check_reshape(mddev) != 0) return false; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); return true; } /* * Remove any failed drives, then add spares if possible. Spares are * also removed and re-added, to allow the personality to fail the * re-add. */ *spares = remove_and_add_spares(mddev, NULL); if (*spares) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); /* Start new recovery. */ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); return true; } /* Check if recovery is in progress. */ if (mddev->recovery_cp < MaxSector) { set_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); return true; } /* Delay to choose resync/check/repair in md_do_sync(). */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) return true; /* Nothing to be done */ return false; } static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, sync_work); int spares = 0; bool suspend = false; if (md_spares_need_change(mddev)) suspend = true; suspend ? mddev_suspend_and_lock_nointr(mddev) : mddev_lock_nointr(mddev); if (!md_is_rdwr(mddev)) { /* * On a read-only array we can: * - remove failed devices * - add already-in_sync devices if the array itself is in-sync. * As we only add devices that are already in-sync, we can * activate the spares immediately. */ remove_and_add_spares(mddev, NULL); goto not_running; } if (!md_choose_sync_action(mddev, &spares)) goto not_running; if (!mddev->pers->sync_request) goto not_running; /* * We are adding a device or devices to an array which has the bitmap * stored on all devices. So make sure all bitmap pages get written. */ if (spares) md_bitmap_write_all(mddev->bitmap); rcu_assign_pointer(mddev->sync_thread, md_register_thread(md_do_sync, mddev, "resync")); if (!mddev->sync_thread) { pr_warn("%s: could not start resync thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ goto not_running; } suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); return; not_running: clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); wake_up(&resync_wait); if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && mddev->sysfs_action) sysfs_notify_dirent_safe(mddev->sysfs_action); } /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. * Raid personalities that don't have a thread (linear/raid0) do not * need this as they never do any recovery or update the superblock. * * It does not do any resync itself, but rather "forks" off other threads * to do that as needed. * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in * "->recovery" and create a thread at ->sync_thread. * When the thread finishes it sets MD_RECOVERY_DONE * and wakeups up this thread which will reap the thread and finish up. * This thread also removes any faulty devices (with nr_pending == 0). * * The overall approach is: * 1/ if the superblock needs updating, update it. * 2/ If a recovery thread is running, don't do anything else. * 3/ If recovery has finished, clean up, possibly marking spares active. * 4/ If there are any faulty devices, remove them. * 5/ If array is degraded, try to add spares devices * 6/ If array has spares or is not in-sync, start a resync thread. */ void md_check_recovery(struct mddev *mddev) { if (READ_ONCE(mddev->suspended)) return; if (mddev->bitmap) md_bitmap_daemon_work(mddev); if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { pr_debug("md: %s in immediate safe mode\n", mdname(mddev)); mddev->safemode = 2; } flush_signals(current); } if (!md_is_rdwr(mddev) && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 && !mddev->in_sync && mddev->recovery_cp == MaxSector) )) return; if (mddev_trylock(mddev)) { bool try_set_sync = mddev->safemode != 0; if (!mddev->external && mddev->safemode == 1) mddev->safemode = 0; if (!md_is_rdwr(mddev)) { struct md_rdev *rdev; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { /* sync_work already queued. */ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } if (!mddev->external && mddev->in_sync) /* * 'Blocked' flag not needed as failed devices * will be recorded if array switched to read/write. * Leaving it set will prevent the device * from being removed. */ rdev_for_each(rdev, mddev) clear_bit(Blocked, &rdev->flags); /* * There is no thread, but we need to call * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); /* * Let md_start_sync() to remove and add rdevs to the * array. */ if (md_spares_need_change(mddev)) { set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); queue_work(md_misc_wq, &mddev->sync_work); } clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); goto unlock; } if (mddev_is_clustered(mddev)) { struct md_rdev *rdev, *tmp; /* kick the device if another node issued a * remove disk. */ rdev_for_each_safe(rdev, tmp, mddev) { if (test_and_clear_bit(ClusterRemove, &rdev->flags) && rdev->raid_disk < 0) md_kick_rdev_from_array(rdev); } } if (try_set_sync && !mddev->external && !mddev->in_sync) { spin_lock(&mddev->lock); set_in_sync(mddev); spin_unlock(&mddev->lock); } if (mddev->sb_flags) md_update_sb(mddev, 0); /* * Never start a new sync thread if MD_RECOVERY_RUNNING is * still set. */ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { /* resync/recovery still happening */ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } if (WARN_ON_ONCE(!mddev->sync_thread)) goto unlock; md_reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ mddev->curr_resync_completed = 0; spin_lock(&mddev->lock); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); spin_unlock(&mddev->lock); /* Clear some bits that don't mean anything, but * might be left set */ clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { queue_work(md_misc_wq, &mddev->sync_work); } else { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); wake_up(&resync_wait); } unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); } } EXPORT_SYMBOL(md_check_recovery); void md_reap_sync_thread(struct mddev *mddev) { struct md_rdev *rdev; sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; /* resync has finished, collect result */ md_unregister_thread(mddev, &mddev->sync_thread); atomic_inc(&mddev->sync_seq); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) { sysfs_notify_dirent_safe(mddev->sysfs_degraded); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) { mddev->pers->finish_reshape(mddev); if (mddev_is_clustered(mddev)) is_reshaped = true; } /* If array is no-longer degraded, then any saved_raid_disk * information must be scrapped. */ if (!mddev->degraded) rdev_for_each(rdev, mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by * clustered raid */ if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) md_cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); /* * We call md_cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster. */ if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) md_cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); wake_up(&resync_wait); } EXPORT_SYMBOL(md_reap_sync_thread); void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, !test_bit(Blocked, &rdev->flags) && !test_bit(BlockedBadBlocks, &rdev->flags), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } EXPORT_SYMBOL(md_wait_for_blocked_rdev); void md_finish_reshape(struct mddev *mddev) { /* called be personality module when reshape completes. */ struct md_rdev *rdev; rdev_for_each(rdev, mddev) { if (rdev->data_offset > rdev->new_data_offset) rdev->sectors += rdev->data_offset - rdev->new_data_offset; else rdev->sectors -= rdev->new_data_offset - rdev->data_offset; rdev->data_offset = rdev->new_data_offset; } } EXPORT_SYMBOL(md_finish_reshape); /* Bad block management */ /* Returns 1 on success, 0 on failure */ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { struct mddev *mddev = rdev->mddev; int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; rv = badblocks_set(&rdev->badblocks, s, sectors, 0); if (rv == 0) { /* Make sure they get written out promptly */ if (test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); sysfs_notify_dirent_safe(rdev->sysfs_state); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); md_wakeup_thread(rdev->mddev->thread); return 1; } else return 0; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; rv = badblocks_clear(&rdev->badblocks, s, sectors); if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_badblocks); return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { struct mddev *mddev, *n; int need_delay = 0; spin_lock(&all_mddevs_lock); list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); if (mddev_trylock(mddev)) { if (mddev->pers) __md_stop_writes(mddev); if (mddev->persistent) mddev->safemode = 2; mddev_unlock(mddev); } need_delay = 1; mddev_put(mddev); spin_lock(&all_mddevs_lock); } spin_unlock(&all_mddevs_lock); /* * certain more exotic SCSI devices are known to be * volatile wrt too early system reboots. While the * right place to handle this issue is the given * driver, we do want to have a safe RAID driver ... */ if (need_delay) msleep(1000); return NOTIFY_DONE; } static struct notifier_block md_notifier = { .notifier_call = md_notify_reboot, .next = NULL, .priority = INT_MAX, /* before any real devices */ }; static void md_geninit(void) { pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); } static int __init md_init(void) { int ret = -ENOMEM; md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; md_misc_wq = alloc_workqueue("md_misc", 0, 0); if (!md_misc_wq) goto err_misc_wq; md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!md_bitmap_wq) goto err_bitmap_wq; ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; ret = __register_blkdev(0, "mdp", md_probe); if (ret < 0) goto err_mdp; mdp_major = ret; register_reboot_notifier(&md_notifier); raid_table_header = register_sysctl("dev/raid", raid_table); md_geninit(); return 0; err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: destroy_workqueue(md_bitmap_wq); err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); err_wq: return ret; } static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); struct md_rdev *rdev2, *tmp; int role, ret; /* * If size is changed in another node then we need to * do resize as well. */ if (mddev->dev_sectors != le64_to_cpu(sb->size)) { ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); if (ret) pr_info("md-cluster: resize failed\n"); else md_bitmap_update_sb(mddev->bitmap); } /* Check for change of roles in the active devices */ rdev_for_each_safe(rdev2, tmp, mddev) { if (test_bit(Faulty, &rdev2->flags)) continue; /* Check if the roles changed */ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); if (test_bit(Candidate, &rdev2->flags)) { if (role == MD_DISK_ROLE_FAULTY) { pr_info("md: Removing Candidate device %pg because add failed\n", rdev2->bdev); md_kick_rdev_from_array(rdev2); continue; } else clear_bit(Candidate, &rdev2->flags); } if (role != rdev2->raid_disk) { /* * got activated except reshape is happening. */ if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { rdev2->saved_raid_disk = role; ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %pg\n", rdev2->bdev); /* wakeup mddev->thread here, so array could * perform resync with the new activated disk */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } /* device faulty * We just want to do the minimum to mark the disk * as faulty. The recovery is performed by the * one who initiated the error. */ if (role == MD_DISK_ROLE_FAULTY || role == MD_DISK_ROLE_JOURNAL) { md_error(mddev, rdev2); clear_bit(Blocked, &rdev2->flags); } } } if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); if (ret) pr_warn("md: updating array disks failed. %d\n", ret); } /* * Since mddev->delta_disks has already updated in update_raid_disks, * so it is time to check reshape. */ if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { /* * reshape is happening in the remote node, we need to * update reshape_position and call start_reshape. */ mddev->reshape_position = le64_to_cpu(sb->reshape_position); if (mddev->pers->update_reshape_pos) mddev->pers->update_reshape_pos(mddev); if (mddev->pers->start_reshape) mddev->pers->start_reshape(mddev); } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && mddev->reshape_position != MaxSector && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { /* reshape is just done in another node. */ mddev->reshape_position = MaxSector; if (mddev->pers->update_reshape_pos) mddev->pers->update_reshape_pos(mddev); } /* Finally set the event to be up to date */ mddev->events = le64_to_cpu(sb->events); } static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) { int err; struct page *swapout = rdev->sb_page; struct mdp_superblock_1 *sb; /* Store the sb page of the rdev in the swapout temporary * variable in case we err in the future */ rdev->sb_page = NULL; err = alloc_disk_sb(rdev); if (err == 0) { ClearPageUptodate(rdev->sb_page); rdev->sb_loaded = 0; err = super_types[mddev->major_version]. load_super(rdev, NULL, mddev->minor_version); } if (err < 0) { pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", __func__, __LINE__, rdev->desc_nr, err); if (rdev->sb_page) put_page(rdev->sb_page); rdev->sb_page = swapout; rdev->sb_loaded = 1; return err; } sb = page_address(rdev->sb_page); /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET * is not set */ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); /* The other node finished recovery, call spare_active to set * device In_sync and mddev->degraded */ if (rdev->recovery_offset == MaxSector && !test_bit(In_sync, &rdev->flags) && mddev->pers->spare_active(mddev)) sysfs_notify_dirent_safe(mddev->sysfs_degraded); put_page(swapout); return 0; } void md_reload_sb(struct mddev *mddev, int nr) { struct md_rdev *rdev = NULL, *iter; int err; /* Find the rdev */ rdev_for_each_rcu(iter, mddev) { if (iter->desc_nr == nr) { rdev = iter; break; } } if (!rdev) { pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); return; } err = read_rdev(mddev, rdev); if (err < 0) return; check_sb_changes(mddev, rdev); /* Read all rdev's to update recovery_offset */ rdev_for_each_rcu(rdev, mddev) { if (!test_bit(Faulty, &rdev->flags)) read_rdev(mddev, rdev); } } EXPORT_SYMBOL(md_reload_sb); #ifndef MODULE /* * Searches all registered partitions for autorun RAID arrays * at boot time. */ static DEFINE_MUTEX(detected_devices_mutex); static LIST_HEAD(all_detected_devices); struct detected_devices_node { struct list_head list; dev_t dev; }; void md_autodetect_dev(dev_t dev) { struct detected_devices_node *node_detected_dev; node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); if (node_detected_dev) { node_detected_dev->dev = dev; mutex_lock(&detected_devices_mutex); list_add_tail(&node_detected_dev->list, &all_detected_devices); mutex_unlock(&detected_devices_mutex); } } void md_autostart_arrays(int part) { struct md_rdev *rdev; struct detected_devices_node *node_detected_dev; dev_t dev; int i_scanned, i_passed; i_scanned = 0; i_passed = 0; pr_info("md: Autodetecting RAID arrays.\n"); mutex_lock(&detected_devices_mutex); while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { i_scanned++; node_detected_dev = list_entry(all_detected_devices.next, struct detected_devices_node, list); list_del(&node_detected_dev->list); dev = node_detected_dev->dev; kfree(node_detected_dev); mutex_unlock(&detected_devices_mutex); rdev = md_import_device(dev,0, 90); mutex_lock(&detected_devices_mutex); if (IS_ERR(rdev)) continue; if (test_bit(Faulty, &rdev->flags)) continue; set_bit(AutoDetected, &rdev->flags); list_add(&rdev->same_set, &pending_raid_disks); i_passed++; } mutex_unlock(&detected_devices_mutex); pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); autorun_devices(part); } #endif /* !MODULE */ static __exit void md_exit(void) { struct mddev *mddev, *n; int delay = 1; unregister_blkdev(MD_MAJOR,"md"); unregister_blkdev(mdp_major, "mdp"); unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); /* We cannot unload the modules while some process is * waiting for us in select() or poll() - wake them up */ md_unloading = 1; while (waitqueue_active(&md_event_waiters)) { /* not safe to leave yet */ wake_up(&md_event_waiters); msleep(delay); delay += delay; } remove_proc_entry("mdstat", NULL); spin_lock(&all_mddevs_lock); list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); export_array(mddev); mddev->ctime = 0; mddev->hold_active = 0; /* * As the mddev is now fully clear, mddev_put will schedule * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete. */ mddev_put(mddev); spin_lock(&all_mddevs_lock); } spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); } subsys_initcall(md_init); module_exit(md_exit) static int get_ro(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%d\n", start_readonly); } static int set_ro(const char *val, const struct kernel_param *kp) { return kstrtouint(val, 10, (unsigned int *)&start_readonly); } module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); module_param(create_on_open, bool, S_IRUSR|S_IWUSR); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
129 129 174 1 174 173 2 174 174 174 173 36 35 36 36 36 36 36 36 36 36 36 36 35 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 12 12 12 12 6 12 195 190 194 187 175 185 142 143 150 23 195 154 42 9 37 42 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 2 2 2 2 138 2 138 138 138 2 138 138 138 2 138 138 135 138 138 138 138 138 2 138 2 2 138 137 138 138 138 138 142 142 142 138 138 138 138 138 138 138 137 138 138 138 138 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 output functions * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_output.c * * Changes: * A.N.Kuznetsov : airthmetics in fragmentation. * extension headers are implemented. * route changes now work. * ip6_forward does not confuse sniffers. * etc. * * H. von Brand : Added missing #include <linux/string.h> * Imran Patel : frag id should be in NBO * Kazunori MIYAZAWA @USAGI * : add ip6_append_data and related functions * for datagram xmit */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/in6.h> #include <linux/tcp.h> #include <linux/route.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/bpf-cgroup.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/sock.h> #include <net/snmp.h> #include <net/gso.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/rawv6.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/checksum.h> #include <linux/mroute6.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *daddr, *nexthop; struct ipv6hdr *hdr; struct neighbour *neigh; int ret; /* Be paranoid, rather than too clever. */ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); return -ENOMEM; } } hdr = ipv6_hdr(skb); daddr = &hdr->daddr; if (ipv6_addr_is_multicast(daddr)) { if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && ((mroute6_is_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); /* Do not check for IFF_ALLMULTI; multicast routing is not supported in any case. */ if (newskb) NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); if (hdr->hop_limit == 0) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; } } IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && !(dev->flags & IFF_LOOPBACK)) { kfree_skb(skb); return 0; } } if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); if (res != LWTUNNEL_XMIT_CONTINUE) return res; } IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); rcu_read_lock(); nexthop = rt6_nexthop((struct rt6_info *)dst, daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); if (unlikely(IS_ERR_OR_NULL(neigh))) { if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) { rcu_read_unlock(); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; } } sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } static int ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { struct sk_buff *segs, *nskb; netdev_features_t features; int ret = 0; /* Please see corresponding comment in ip_finish_output_gso * describing the cases where GSO segment length exceeds the * egress MTU. */ features = netif_skb_features(skb); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); return -ENOMEM; } consume_skb(skb); skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); /* Last GSO segment can be smaller than gso_size (and MTU). * Adding a fragment header would produce an "atomic fragment", * which is considered harmful (RFC-8021). Avoid that. */ err = segs->len > mtu ? ip6_fragment(net, sk, segs, ip6_finish_output2) : ip6_finish_output2(net, sk, segs); if (err && ret == 0) ret = err; } return ret; } static int ip6_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && !skb_gso_validate_network_len(skb, mtu)) return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); return ip6_finish_output2(net, sk, skb); } static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IP6CB(skb)->flags |= IP6SKB_REROUTED; return dst_output(net, sk, skb); } #endif mtu = ip6_skb_dst_mtu(skb); if (skb_is_gso(skb)) return ip6_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) return ip6_fragment(net, sk, skb, ip6_finish_output2); return ip6_finish_output2(net, sk, skb); } static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_SUCCESS: case NET_XMIT_CN: return __ip6_finish_output(net, sk, skb) ? : ret; default: kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } } int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; if (unlikely(idev->cnf.disable_ipv6)) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; } return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, indev, dev, ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } EXPORT_SYMBOL(ip6_output); bool ip6_autoflowlabel(struct net *net, const struct sock *sk) { if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) return ip6_default_np_autolabel(net); return inet6_test_bit(AUTOFLOWLABEL, sk); } /* * xmit an sk_buff (used by TCP, SCTP and DCCP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; struct inet6_dev *idev = ip6_dst_idev(dst); struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; int hlimit = -1; u32 mtu; head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; if (unlikely(head_room > skb_headroom(skb))) { skb = skb_expand_head(skb, head_room); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); return -ENOBUFS; } } if (opt) { seg_len += opt->opt_nflen + opt->opt_flen; if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); if (opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, &fl6->saddr); } if (unlikely(seg_len > IPV6_MAXPLEN)) { hop_jumbo = skb_push(skb, hoplen); hop_jumbo->nexthdr = proto; hop_jumbo->hdrlen = 0; hop_jumbo->tlv_type = IPV6_TLV_JUMBO; hop_jumbo->tlv_len = 4; hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen); proto = IPPROTO_HOPOPTS; seg_len = 0; IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO; } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); /* * Fill in the IPv6 header */ if (np) hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, ip6_autoflowlabel(net, sk), fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; hdr->hop_limit = hlimit; hdr->saddr = fl6->saddr; hdr->daddr = *first_hop; skb->protocol = htons(ETH_P_IPV6); skb->priority = priority; skb->mark = mark; mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out((struct sock *)sk, skb); if (unlikely(!skb)) return 0; /* hooks should never assume socket lock is held. * we promote our socket to non const */ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, (struct sock *)sk, skb, NULL, dev, dst_output); } skb->dev = dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const */ ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } EXPORT_SYMBOL(ip6_xmit); static int ip6_call_ra_chain(struct sk_buff *skb, int sel) { struct ip6_ra_chain *ra; struct sock *last = NULL; read_lock(&ip6_ra_lock); for (ra = ip6_ra_chain; ra; ra = ra->next) { struct sock *sk = ra->sk; if (sk && ra->sel == sel && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == skb->dev->ifindex)) { if (inet6_test_bit(RTALERT_ISOLATE, sk) && !net_eq(sock_net(sk), dev_net(skb->dev))) { continue; } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) rawv6_rcv(last, skb2); } last = sk; } } if (last) { rawv6_rcv(last, skb); read_unlock(&ip6_ra_lock); return 1; } read_unlock(&ip6_ra_lock); return 0; } static int ip6_forward_proxy_check(struct sk_buff *skb) { struct ipv6hdr *hdr = ipv6_hdr(skb); u8 nexthdr = hdr->nexthdr; __be16 frag_off; int offset; if (ipv6_ext_hdr(nexthdr)) { offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); if (offset < 0) return 0; } else offset = sizeof(struct ipv6hdr); if (nexthdr == IPPROTO_ICMPV6) { struct icmp6hdr *icmp6; if (!pskb_may_pull(skb, (skb_network_header(skb) + offset + 1 - skb->data))) return 0; icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); switch (icmp6->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: /* For reaction involving unicast neighbor discovery * message destined to the proxied address, pass it to * input function. */ return 1; default: break; } } /* * The proxying router can't forward traffic sent to a link-local * address, so signal the sender and discard the packet. This * behavior is clarified by the MIPv6 specification. */ if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { dst_link_failure(skb); return -1; } return 0; } static inline int ip6_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_NET_SWITCHDEV if (skb->offload_l3_fwd_mark) { consume_skb(skb); return 0; } #endif skb_clear_tstamp(skb); return dst_output(net, sk, skb); } static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) return true; if (skb->ignore_df) return false; if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; } int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); struct inet6_dev *idev; SKB_DR(reason); u32 mtu; idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); if (net->ipv6.devconf_all->forwarding == 0) goto error; if (skb->pkt_type != PACKET_HOST) goto drop; if (unlikely(skb->sk)) goto drop; if (skb_warn_if_lro(skb)) goto drop; if (!net->ipv6.devconf_all->disable_policy && (!idev || !idev->cnf.disable_policy) && !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } skb_forward_csum(skb); /* * We DO NOT make any processing on * RA packets, pushing them to user level AS IS * without ane WARRANTY that application will be able * to interpret them. The reason is that we * cannot make anything clever here. * * We are not end-node, so that if packet contains * AH/ESP, we cannot make anything. * Defragmentation also would be mistake, RA packets * cannot be fragmented, because there is no warranty * that different fragments will go along one path. --ANK */ if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { if (ip6_call_ra_chain(skb, ntohs(opt->ra))) return 0; } /* * check and decrement ttl */ if (hdr->hop_limit <= 1) { icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return -ETIMEDOUT; } /* XXX: idev->cnf.proxy_ndp? */ if (net->ipv6.devconf_all->proxy_ndp && pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit * here by 1, as we do at the end of the * function too. * * But that would be incorrect, as proxying is * not forwarding. The ip6_input function * will handle this packet locally, and it * depends on the hop limit being unchanged. * * One example is the NDP hop limit, that * always has to stay 255, but other would be * similar checks around RA packets, where the * user can even change the desired limit. */ return ip6_input(skb); } else if (proxied < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); SKB_DR_SET(reason, XFRM_POLICY); goto drop; } dst = skb_dst(skb); /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. */ if (IP6CB(skb)->iif == dst->dev->ifindex && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; struct inet_peer *peer; struct rt6_info *rt; /* * incoming and outgoing devices are the same * send a redirect. */ rt = (struct rt6_info *) dst; if (rt->rt6i_flags & RTF_GATEWAY) target = &rt->rt6i_gateway; else target = &hdr->daddr; peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) */ if (inet_peer_xrlim_allow(peer, 1*HZ)) ndisc_send_redirect(skb, target); if (peer) inet_putpeer(peer); } else { int addrtype = ipv6_addr_type(&hdr->saddr); /* This check is security critical. */ if (addrtype == IPV6_ADDR_ANY || addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) goto error; if (addrtype & IPV6_ADDR_LINKLOCAL) { icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOT_NEIGHBOUR, 0); goto error; } } __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); mtu = ip6_dst_mtu_maybe_forward(dst, true); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); return -EMSGSIZE; } if (skb_cow(skb, dst->dev->hard_header_len)) { __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); goto drop; } hdr = ipv6_hdr(skb); /* Mangling hops number delayed to point after skb COW */ hdr->hop_limit--; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, skb->dev, dst->dev, ip6_forward_finish); error: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); SKB_DR_SET(reason, IP_INADDRERRORS); drop: kfree_skb_reason(skb, reason); return -EINVAL; } static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; skb_dst_drop(to); skb_dst_set(to, dst_clone(skb_dst(from))); to->dev = from->dev; to->mark = from->mark; skb_copy_hash(to, from); #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif nf_copy(to, from); skb_ext_copy(to, from); skb_copy_secmark(to, from); } int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_fraglist_iter *iter) { unsigned int first_len; struct frag_hdr *fh; /* BUILD HEADER */ *prevhdr = NEXTHDR_FRAGMENT; iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); if (!iter->tmp_hdr) return -ENOMEM; iter->frag = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); iter->offset = 0; iter->hlen = hlen; iter->frag_id = frag_id; iter->nexthdr = nexthdr; __skb_pull(skb, hlen); fh = __skb_push(skb, sizeof(struct frag_hdr)); __skb_push(skb, hlen); skb_reset_network_header(skb); memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); fh->nexthdr = nexthdr; fh->reserved = 0; fh->frag_off = htons(IP6_MF); fh->identification = frag_id; first_len = skb_pagelen(skb); skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); return 0; } EXPORT_SYMBOL(ip6_fraglist_init); void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter) { struct sk_buff *frag = iter->frag; unsigned int hlen = iter->hlen; struct frag_hdr *fh; frag->ip_summed = CHECKSUM_NONE; skb_reset_transport_header(frag); fh = __skb_push(frag, sizeof(struct frag_hdr)); __skb_push(frag, hlen); skb_reset_network_header(frag); memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); iter->offset += skb->len - hlen - sizeof(struct frag_hdr); fh->nexthdr = iter->nexthdr; fh->reserved = 0; fh->frag_off = htons(iter->offset); if (frag->next) fh->frag_off |= htons(IP6_MF); fh->identification = iter->frag_id; ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); ip6_copy_metadata(frag, skb); } EXPORT_SYMBOL(ip6_fraglist_prepare); void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) { state->prevhdr = prevhdr; state->nexthdr = nexthdr; state->frag_id = frag_id; state->hlen = hlen; state->mtu = mtu; state->left = skb->len - hlen; /* Space per frame */ state->ptr = hlen; /* Where to start from */ state->hroom = hdr_room; state->troom = needed_tailroom; state->offset = 0; } EXPORT_SYMBOL(ip6_frag_init); struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) { u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; struct sk_buff *frag; struct frag_hdr *fh; unsigned int len; len = state->left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > state->mtu) len = state->mtu; /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ if (len < state->left) len &= ~7; /* Allocate buffer */ frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + state->hroom + state->troom, GFP_ATOMIC); if (!frag) return ERR_PTR(-ENOMEM); /* * Set up data on packet */ ip6_copy_metadata(frag, skb); skb_reserve(frag, state->hroom); skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); skb_reset_network_header(frag); fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); frag->transport_header = (frag->network_header + state->hlen + sizeof(struct frag_hdr)); /* * Charge the memory for the fragment to any owner * it might possess */ if (skb->sk) skb_set_owner_w(frag, skb->sk); /* * Copy the packet header into the new buffer. */ skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); fragnexthdr_offset = skb_network_header(frag); fragnexthdr_offset += prevhdr - skb_network_header(skb); *fragnexthdr_offset = NEXTHDR_FRAGMENT; /* * Build fragment header. */ fh->nexthdr = state->nexthdr; fh->reserved = 0; fh->identification = state->frag_id; /* * Copy a block of the IP datagram. */ BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), len)); state->left -= len; fh->frag_off = htons(state->offset); if (state->left > 0) fh->frag_off |= htons(IP6_MF); ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); state->ptr += len; state->offset += len; return frag; } EXPORT_SYMBOL(ip6_frag_next); int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; bool mono_delivery_time = skb->mono_delivery_time; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; ktime_t tstamp = skb->tstamp; int hroom, err = 0; __be32 frag_id; u8 *prevhdr, nexthdr = 0; err = ip6_find_1stfragopt(skb, &prevhdr); if (err < 0) goto fail; hlen = err; nexthdr = *prevhdr; nexthdr_offset = prevhdr - skb_network_header(skb); mtu = ip6_skb_dst_mtu(skb); /* We must not fragment if the socket is set to force MTU discovery * or if the skb it not generated by a local socket. */ if (unlikely(!skb->ignore_df && skb->len > mtu)) goto fail_toobig; if (IP6CB(skb)->frag_max_size) { if (IP6CB(skb)->frag_max_size > mtu) goto fail_toobig; /* don't send fragments larger than what we received */ mtu = IP6CB(skb)->frag_max_size; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; } if (np) { u32 frag_size = READ_ONCE(np->frag_size); if (frag_size && frag_size < mtu) mtu = frag_size; } if (mtu < hlen + sizeof(struct frag_hdr) + 8) goto fail_toobig; mtu -= hlen + sizeof(struct frag_hdr); frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr); if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_checksum_help(skb))) goto fail; prevhdr = skb_network_header(skb) + nexthdr_offset; hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { unsigned int first_len = skb_pagelen(skb); struct ip6_fraglist_iter iter; struct sk_buff *frag2; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || skb_cloned(skb) || skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path_clean; /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path_clean; BUG_ON(frag->sk); if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; } skb->truesize -= frag->truesize; } err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, &iter); if (err < 0) goto fail; /* We prevent @rt from being freed. */ rcu_read_lock(); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (iter.frag) ip6_fraglist_prepare(skb, &iter); skb_set_delivery_time(skb, tstamp, mono_delivery_time); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); if (err || !iter.frag) break; skb = ip6_fraglist_next(&iter); } kfree(iter.tmp_hdr); if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGOKS); rcu_read_unlock(); return 0; } kfree_skb_list(iter.frag); IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); rcu_read_unlock(); return err; slow_path_clean: skb_walk_frags(skb, frag2) { if (frag2 == frag) break; frag2->sk = NULL; frag2->destructor = NULL; skb->truesize += frag2->truesize; } } slow_path: /* * Fragment the datagram. */ ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, &state); /* * Keep copying data until we run out. */ while (state.left > 0) { frag = ip6_frag_next(skb, &state); if (IS_ERR(frag)) { err = PTR_ERR(frag); goto fail; } /* * Put this fragment into the sending queue. */ skb_set_delivery_time(frag, tstamp, mono_delivery_time); err = output(net, sk, frag); if (err) goto fail; IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGCREATES); } IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGOKS); consume_skb(skb); return err; fail_toobig: icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); err = -EMSGSIZE; fail: IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return err; } static inline int ip6_rt_check(const struct rt6key *rt_key, const struct in6_addr *fl_addr, const struct in6_addr *addr_cache) { return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache)); } static struct dst_entry *ip6_sk_dst_check(struct sock *sk, struct dst_entry *dst, const struct flowi6 *fl6) { struct ipv6_pinfo *np = inet6_sk(sk); struct rt6_info *rt; if (!dst) goto out; if (dst->ops->family != AF_INET6) { dst_release(dst); return NULL; } rt = (struct rt6_info *)dst; /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, * and MSG_DONTROUTE --ANK (980726) * * 1. ip6_rt_check(): If route was host route, * check that cached destination is current. * If it is network route, we still may * check its validity using saved pointer * to the last used address: daddr_cache. * We do not want to save whole address now, * (because main consumer of this service * is tcp, which has not this problem), * so that the last trick works only on connected * sockets. * 2. oif also should be the same. */ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { dst_release(dst); dst = NULL; } out: return dst; } static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD struct neighbour *n; struct rt6_info *rt; #endif int err; int flags = 0; /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the * ip6_route_output call _before_ ip6_route_get_saddr. * * In source specific routing (no src=any default route), * ip6_route_output will fail given src=any saddr, though, so * that's why we try it again later. */ if (ipv6_addr_any(&fl6->saddr)) { struct fib6_info *from; struct rt6_info *rt; *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; rcu_read_lock(); from = rt ? rcu_dereference(rt->from) : NULL; err = ip6_route_get_saddr(net, from, &fl6->daddr, sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0, &fl6->saddr); rcu_read_unlock(); if (err) goto out_err_release; /* If we had an erroneous initial result, pretend it * never existed and let the SA-enabled version take * over. */ if ((*dst)->error) { dst_release(*dst); *dst = NULL; } if (fl6->flowi6_oif) flags |= RT6_LOOKUP_F_IFACE; } if (!*dst) *dst = ip6_route_output_flags(net, sk, fl6, flags); err = (*dst)->error; if (err) goto out_err_release; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * Here if the dst entry we've looked up * has a neighbour entry that is in the INCOMPLETE * state and the src address from the flow is * marked as OPTIMISTIC, we release the found * dst entry and replace it instead with the * dst entry of the nexthop router */ rt = (struct rt6_info *) *dst; rcu_read_lock(); n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr)); err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0; rcu_read_unlock(); if (err) { struct inet6_ifaddr *ifp; struct flowi6 fl_gw6; int redirect; ifp = ipv6_get_ifaddr(net, &fl6->saddr, (*dst)->dev, 1); redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); if (ifp) in6_ifa_put(ifp); if (redirect) { /* * We need to get the dst entry for the * default router instead */ dst_release(*dst); memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); *dst = ip6_route_output(net, sk, &fl_gw6); err = (*dst)->error; if (err) goto out_err_release; } } #endif if (ipv6_addr_v4mapped(&fl6->saddr) && !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) { err = -EAFNOSUPPORT; goto out_err_release; } return 0; out_err_release: dst_release(*dst); *dst = NULL; if (err == -ENETUNREACH) IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; } /** * ip6_dst_lookup - perform route lookup on flow * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @dst: pointer to dst_entry * for result * @fl6: flow to lookup * * This function performs a route lookup on the given flow. * * It returns zero on success, or a standard errno code on error. */ int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { *dst = NULL; return ip6_dst_lookup_tail(net, sk, dst, fl6); } EXPORT_SYMBOL_GPL(ip6_dst_lookup); /** * ip6_dst_lookup_flow - perform route lookup on flow with ipsec * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup * * This function performs a route lookup on the given flow. * * It returns a valid dst pointer on success, or a pointer encoded * error code. */ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { struct dst_entry *dst = NULL; int err; err = ip6_dst_lookup_tail(net, sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); /** * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow * @sk: socket which provides the dst cache and route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup * @connected: whether @sk is connected or not * * This function performs a route lookup on the given flow with the * possibility of using the cached route in the socket if it is valid. * It will take the socket dst lock when operating on the dst cache. * As a result, this function can only be used in process context. * * In addition, for a connected socket, cache the dst in the socket * if the current cache is not valid. * * It returns a valid dst pointer on success, or a pointer encoded * error code. */ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected) { struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); dst = ip6_sk_dst_check(sk, dst, fl6); if (dst) return dst; dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); if (connected && !IS_ERR(dst)) ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); return dst; } EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, gfp_t gfp) { return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; } static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, gfp_t gfp) { return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; } static void ip6_append_data_mtu(unsigned int *mtu, int *maxfraglen, unsigned int fragheaderlen, struct sk_buff *skb, struct rt6_info *rt, unsigned int orig_mtu) { if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { if (!skb) { /* first fragment, reserve header_len */ *mtu = orig_mtu - rt->dst.header_len; } else { /* * this fragment is not first, the headers * space is regarded as data space. */ *mtu = orig_mtu; } *maxfraglen = ((*mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); } } static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, struct rt6_info *rt) { struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu, frag_size; struct ipv6_txoptions *nopt, *opt = ipc6->opt; /* callers pass dst together with a reference, set it first so * ip6_cork_release() can put it down even in case of an error. */ cork->base.dst = &rt->dst; /* * setup for corking */ if (opt) { if (WARN_ON(v6_cork->opt)) return -EINVAL; nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation); if (unlikely(!nopt)) return -ENOBUFS; nopt->tot_len = sizeof(*opt); nopt->opt_flen = opt->opt_flen; nopt->opt_nflen = opt->opt_nflen; nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation); if (opt->dst0opt && !nopt->dst0opt) return -ENOBUFS; nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation); if (opt->dst1opt && !nopt->dst1opt) return -ENOBUFS; nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation); if (opt->hopopt && !nopt->hopopt) return -ENOBUFS; nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation); if (opt->srcrt && !nopt->srcrt) return -ENOBUFS; /* need source address above miyazawa*/ } v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); frag_size = READ_ONCE(np->frag_size); if (frag_size && frag_size < mtu) mtu = frag_size; cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); cork->base.length = 0; cork->base.transmit_time = ipc6->sockc.transmit_time; return 0; } static int __ip6_append_data(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork_full, struct inet6_cork *v6_cork, struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, unsigned int flags, struct ipcm6_cookie *ipc6) { struct sk_buff *skb, *skb_prev = NULL; struct inet_cork *cork = &cork_full->base; struct flowi6 *fl6 = &cork_full->fl.u.ip6; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; struct ubuf_info *uarg = NULL; int exthdrlen = 0; int dst_exthdrlen = 0; int hh_len; int copy; int err; int offset = 0; bool zc = false; u32 tskey = 0; struct rt6_info *rt = (struct rt6_info *)cork->dst; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; bool paged, extra_uref = false; skb = skb_peek_tail(queue); if (!skb) { exthdrlen = opt ? opt->opt_flen : 0; dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; } paged = !!cork->gso_size; mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; if (cork->tx_flags & SKBTX_ANY_TSTAMP && READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + rt->rt6i_nfheader_len; if (mtu <= fragheaderlen || ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr)) goto emsgsize; maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit * the first fragment */ if (headersize + transhdrlen > mtu) goto emsgsize; if (cork->length + length > mtu - headersize && ipc6->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { ipv6_local_rxpmtu(sk, fl6, mtu - headersize + sizeof(struct ipv6hdr)); goto emsgsize; } if (ip6_sk_ignore_df(sk)) maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; else maxnonfragsize = mtu; if (cork->length + length > maxnonfragsize - headersize) { emsgsize: pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); return -EMSGSIZE; } /* CHECKSUM_PARTIAL only with no extension headers and when * we are not going to fragment */ if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && headersize == sizeof(struct ipv6hdr) && length <= mtu - headersize && (!(flags & MSG_MORE) || cork->gso_size) && rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; if ((flags & MSG_ZEROCOPY) && length) { struct msghdr *msg = from; if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) return -EINVAL; /* Leave uarg NULL if can't zerocopy, callers should * be able to handle it. */ if ((rt->dst.dev->features & NETIF_F_SG) && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; } else { uarg_to_msgzc(uarg)->zerocopy = 0; skb_zcopy_set(skb, uarg, &extra_uref); } } } else if ((flags & MSG_SPLICE_PAGES) && length) { if (inet_test_bit(HDRINCL, sk)) return -EPERM; if (rt->dst.dev->features & NETIF_F_SG && getfrag == ip_generic_getfrag) /* We need an empty buffer to attach stuff to */ paged = true; else flags &= ~MSG_SPLICE_PAGES; } /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. * Otherwise, we need to reserve fragment header and * fragment alignment (= 8-15 octects, in total). * * Note that we may need to "move" the data from the tail * of the buffer to the new fragment when we split * the message. * * FIXME: It may be fragmented into multiple chunks * at once if non-fragmentable extension headers * are too large. * --yoshfuji */ cork->length += length; if (!skb) goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len; if (copy < length) copy = maxfraglen - skb->len; if (copy <= 0) { char *data; unsigned int datalen; unsigned int fraglen; unsigned int fraggap; unsigned int alloclen, alloc_extra; unsigned int pagedlen; alloc_new_skb: /* There's no room in the current skb */ if (skb) fraggap = skb->len - maxfraglen; else fraggap = 0; /* update mtu and maxfraglen if necessary */ if (!skb || !skb_prev) ip6_append_data_mtu(&mtu, &maxfraglen, fragheaderlen, skb, rt, orig_mtu); skb_prev = skb; /* * If remaining data exceeds the mtu, * we know we need more fragment(s). */ datalen = length + fraggap; if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; fraglen = datalen + fragheaderlen; pagedlen = 0; alloc_extra = hh_len; alloc_extra += dst_exthdrlen; alloc_extra += rt->dst.trailer_len; /* We just reserve space for fragment header. * Note: this may be overallocation if the message * (without MSG_MORE) fits into the MTU. */ alloc_extra += sizeof(struct frag_hdr); if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else if (!paged && (fraglen + alloc_extra < SKB_MAX_ALLOC || !(rt->dst.dev->features & NETIF_F_SG))) alloclen = fraglen; else { alloclen = fragheaderlen + transhdrlen; pagedlen = datalen - transhdrlen; } alloclen += alloc_extra; if (datalen != length + fraggap) { /* * this is not the last fragment, the trailer * space is regarded as data space. */ datalen += rt->dst.trailer_len; } fraglen = datalen + fragheaderlen; copy = datalen - transhdrlen - fraggap - pagedlen; /* [!] NOTE: copy may be negative if pagedlen>0 * because then the equation may reduces to -fraggap. */ if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { err = -EINVAL; goto error; } if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) skb = alloc_skb(alloclen, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } if (!skb) goto error; /* * Fill in the control structures */ skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = csummode; skb->csum = 0; /* reserve for fragmentation and ipsec header */ skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); /* * Find where to start putting bytes */ data = skb_put(skb, fraglen - pagedlen); skb_set_network_header(skb, exthdrlen); data += fragheaderlen; skb->transport_header = (skb->network_header + fragheaderlen); if (fraggap) { skb->csum = skb_copy_and_csum_bits( skb_prev, maxfraglen, data + transhdrlen, fraggap); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; } else if (flags & MSG_SPLICE_PAGES) { copy = 0; } offset += copy; length -= copy + transhdrlen; transhdrlen = 0; exthdrlen = 0; dst_exthdrlen = 0; /* Only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; skb_zcopy_set(skb, uarg, &extra_uref); if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); /* * Put the packet on the pending queue */ if (!skb->destructor) { skb->destructor = sock_wfree; skb->sk = sk; wmem_alloc_delta += skb->truesize; } __skb_queue_tail(queue, skb); continue; } if (copy > length) copy = length; if (!(rt->dst.dev->features&NETIF_F_SG) && skb_tailroom(skb) >= copy) { unsigned int off; off = skb->len; if (getfrag(from, skb_put(skb, copy), offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; } } else if (flags & MSG_SPLICE_PAGES) { struct msghdr *msg = from; err = -EIO; if (WARN_ON_ONCE(copy > msg->msg_iter.count)) goto error; err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) goto error; copy = err; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; if (!sk_page_frag_refill(sk, pfrag)) goto error; skb_zcopy_downgrade_managed(skb); if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { err = -EMSGSIZE; if (i == MAX_SKB_FRAGS) goto error; __skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, 0); skb_shinfo(skb)->nr_frags = ++i; get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (getfrag(from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; pfrag->offset += copy; skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb->len += copy; skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; } else { err = skb_zerocopy_iter_dgram(skb, from, copy); if (err < 0) goto error; } offset += copy; length -= copy; } if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; error_efault: err = -EFAULT; error: net_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return err; } int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); int exthdrlen; int err; if (flags&MSG_PROBE) return 0; if (skb_queue_empty(&sk->sk_write_queue)) { /* * setup for corking */ dst_hold(&rt->dst); err = ip6_setup_cork(sk, &inet->cork, &np->cork, ipc6, rt); if (err) return err; inet->cork.fl.u.ip6 = *fl6; exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); length += exthdrlen; transhdrlen += exthdrlen; } else { transhdrlen = 0; } return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, &np->cork, sk_page_frag(sk), getfrag, from, length, transhdrlen, flags, ipc6); } EXPORT_SYMBOL_GPL(ip6_append_data); static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) { struct dst_entry *dst = cork->base.dst; cork->base.dst = NULL; skb_dst_set(skb, dst); } static void ip6_cork_release(struct inet_cork_full *cork, struct inet6_cork *v6_cork) { if (v6_cork->opt) { struct ipv6_txoptions *opt = v6_cork->opt; kfree(opt->dst0opt); kfree(opt->dst1opt); kfree(opt->hopopt); kfree(opt->srcrt); kfree(opt); v6_cork->opt = NULL; } if (cork->base.dst) { dst_release(cork->base.dst); cork->base.dst = NULL; } } struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork, struct inet6_cork *v6_cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct in6_addr *final_dst; struct net *net = sock_net(sk); struct ipv6hdr *hdr; struct ipv6_txoptions *opt = v6_cork->opt; struct rt6_info *rt = (struct rt6_info *)cork->base.dst; struct flowi6 *fl6 = &cork->fl.u.ip6; unsigned char proto = fl6->flowi6_proto; skb = __skb_dequeue(queue); if (!skb) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ if (skb->data < skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(queue)) != NULL) { __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } /* Allow local fragmentation. */ skb->ignore_df = ip6_sk_ignore_df(sk); __skb_pull(skb, skb_network_header_len(skb)); final_dst = &fl6->daddr; if (opt && opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); if (opt && opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr); skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, ip6_autoflowlabel(net, sk), fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; hdr->daddr = *final_dst; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = cork->base.mark; skb->tstamp = cork->base.transmit_time; ip6_cork_steal_dst(skb, cork); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); if (proto == IPPROTO_ICMPV6) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); u8 icmp6_type; if (sk->sk_socket->type == SOCK_RAW && !inet_test_bit(HDRINCL, sk)) icmp6_type = fl6->fl6_icmp_type; else icmp6_type = icmp6_hdr(skb)->icmp6_type; ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } ip6_cork_release(cork, v6_cork); out: return skb; } int ip6_send_skb(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); int err; err = ip6_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); if (err) IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); } return err; } int ip6_push_pending_frames(struct sock *sk) { struct sk_buff *skb; skb = ip6_finish_skb(sk); if (!skb) return 0; return ip6_send_skb(skb); } EXPORT_SYMBOL_GPL(ip6_push_pending_frames); static void __ip6_flush_pending_frames(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork, struct inet6_cork *v6_cork) { struct sk_buff *skb; while ((skb = __skb_dequeue_tail(queue)) != NULL) { if (skb_dst(skb)) IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); } ip6_cork_release(cork, v6_cork); } void ip6_flush_pending_frames(struct sock *sk) { __ip6_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork, &inet6_sk(sk)->cork); } EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork) { struct inet6_cork v6_cork; struct sk_buff_head queue; int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); int err; if (flags & MSG_PROBE) { dst_release(&rt->dst); return NULL; } __skb_queue_head_init(&queue); cork->base.flags = 0; cork->base.addr = 0; cork->base.opt = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt); if (err) { ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); } if (ipc6->dontfrag < 0) ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk); err = __ip6_append_data(sk, &queue, cork, &v6_cork, &current->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, flags, ipc6); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); } return __ip6_make_skb(sk, &queue, cork, &v6_cork); }
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 /* * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/in.h> #include <linux/ipv6.h> #include "rds.h" #include "loop.h" static char * const rds_trans_modules[] = { [RDS_TRANS_IB] = "rds_rdma", [RDS_TRANS_GAP] = NULL, [RDS_TRANS_TCP] = "rds_tcp", }; static struct rds_transport *transports[RDS_TRANS_COUNT]; static DECLARE_RWSEM(rds_trans_sem); void rds_trans_register(struct rds_transport *trans) { BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); down_write(&rds_trans_sem); if (transports[trans->t_type]) printk(KERN_ERR "RDS Transport type %d already registered\n", trans->t_type); else { transports[trans->t_type] = trans; printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); } up_write(&rds_trans_sem); } EXPORT_SYMBOL_GPL(rds_trans_register); void rds_trans_unregister(struct rds_transport *trans) { down_write(&rds_trans_sem); transports[trans->t_type] = NULL; printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); up_write(&rds_trans_sem); } EXPORT_SYMBOL_GPL(rds_trans_unregister); void rds_trans_put(struct rds_transport *trans) { if (trans) module_put(trans->t_owner); } struct rds_transport *rds_trans_get_preferred(struct net *net, const struct in6_addr *addr, __u32 scope_id) { struct rds_transport *ret = NULL; struct rds_transport *trans; unsigned int i; if (ipv6_addr_v4mapped(addr)) { if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET) return &rds_loop_transport; } else if (ipv6_addr_loopback(addr)) { return &rds_loop_transport; } down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; if (trans && (trans->laddr_check(net, addr, scope_id) == 0) && (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break; } } up_read(&rds_trans_sem); return ret; } struct rds_transport *rds_trans_get(int t_type) { struct rds_transport *ret = NULL; struct rds_transport *trans; down_read(&rds_trans_sem); trans = transports[t_type]; if (!trans) { up_read(&rds_trans_sem); if (rds_trans_modules[t_type]) request_module(rds_trans_modules[t_type]); down_read(&rds_trans_sem); trans = transports[t_type]; } if (trans && trans->t_type == t_type && (!trans->t_owner || try_module_get(trans->t_owner))) ret = trans; up_read(&rds_trans_sem); return ret; } /* * This returns the number of stats entries in the snapshot and only * copies them using the iter if there is enough space for them. The * caller passes in the global stats so that we can size and copy while * holding the lock. */ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail) { struct rds_transport *trans; unsigned int total = 0; unsigned int part; int i; rds_info_iter_unmap(iter); down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; if (!trans || !trans->stats_info_copy) continue; part = trans->stats_info_copy(iter, avail); avail -= min(avail, part); total += part; } up_read(&rds_trans_sem); return total; }
7 7 7 7 7 7 7 7 7 7 7 7 7 7 4 4 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 8 8 2 2 2 7 7 7 7 7 7 3 3 3 3 3 3 2 2 2 2 2 2 2 2 8 7 7 7 7 7 7 8 7 7 7 7 7 7 7 7 7 2 2 2 2 2 2 2 8 8 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 // SPDX-License-Identifier: GPL-2.0-only /* * The industrial I/O core * * Copyright (c) 2008 Jonathan Cameron * * Based on elements of hwmon and input subsystems. */ #define pr_fmt(fmt) "iio-core: " fmt #include <linux/anon_inodes.h> #include <linux/cdev.h> #include <linux/debugfs.h> #include <linux/device.h> #include <linux/err.h> #include <linux/fs.h> #include <linux/idr.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/property.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/iio/buffer.h> #include <linux/iio/buffer_impl.h> #include <linux/iio/events.h> #include <linux/iio/iio-opaque.h> #include <linux/iio/iio.h> #include <linux/iio/sysfs.h> #include "iio_core.h" #include "iio_core_trigger.h" /* IDA to assign each registered device a unique id */ static DEFINE_IDA(iio_ida); static dev_t iio_devt; #define IIO_DEV_MAX 256 struct bus_type iio_bus_type = { .name = "iio", }; EXPORT_SYMBOL(iio_bus_type); static struct dentry *iio_debugfs_dentry; static const char * const iio_direction[] = { [0] = "in", [1] = "out", }; static const char * const iio_chan_type_name_spec[] = { [IIO_VOLTAGE] = "voltage", [IIO_CURRENT] = "current", [IIO_POWER] = "power", [IIO_ACCEL] = "accel", [IIO_ANGL_VEL] = "anglvel", [IIO_MAGN] = "magn", [IIO_LIGHT] = "illuminance", [IIO_INTENSITY] = "intensity", [IIO_PROXIMITY] = "proximity", [IIO_TEMP] = "temp", [IIO_INCLI] = "incli", [IIO_ROT] = "rot", [IIO_ANGL] = "angl", [IIO_TIMESTAMP] = "timestamp", [IIO_CAPACITANCE] = "capacitance", [IIO_ALTVOLTAGE] = "altvoltage", [IIO_CCT] = "cct", [IIO_PRESSURE] = "pressure", [IIO_HUMIDITYRELATIVE] = "humidityrelative", [IIO_ACTIVITY] = "activity", [IIO_STEPS] = "steps", [IIO_ENERGY] = "energy", [IIO_DISTANCE] = "distance", [IIO_VELOCITY] = "velocity", [IIO_CONCENTRATION] = "concentration", [IIO_RESISTANCE] = "resistance", [IIO_PH] = "ph", [IIO_UVINDEX] = "uvindex", [IIO_ELECTRICALCONDUCTIVITY] = "electricalconductivity", [IIO_COUNT] = "count", [IIO_INDEX] = "index", [IIO_GRAVITY] = "gravity", [IIO_POSITIONRELATIVE] = "positionrelative", [IIO_PHASE] = "phase", [IIO_MASSCONCENTRATION] = "massconcentration", [IIO_DELTA_ANGL] = "deltaangl", [IIO_DELTA_VELOCITY] = "deltavelocity", [IIO_COLORTEMP] = "colortemp", [IIO_CHROMATICITY] = "chromaticity", }; static const char * const iio_modifier_names[] = { [IIO_MOD_X] = "x", [IIO_MOD_Y] = "y", [IIO_MOD_Z] = "z", [IIO_MOD_X_AND_Y] = "x&y", [IIO_MOD_X_AND_Z] = "x&z", [IIO_MOD_Y_AND_Z] = "y&z", [IIO_MOD_X_AND_Y_AND_Z] = "x&y&z", [IIO_MOD_X_OR_Y] = "x|y", [IIO_MOD_X_OR_Z] = "x|z", [IIO_MOD_Y_OR_Z] = "y|z", [IIO_MOD_X_OR_Y_OR_Z] = "x|y|z", [IIO_MOD_ROOT_SUM_SQUARED_X_Y] = "sqrt(x^2+y^2)", [IIO_MOD_SUM_SQUARED_X_Y_Z] = "x^2+y^2+z^2", [IIO_MOD_LIGHT_BOTH] = "both", [IIO_MOD_LIGHT_IR] = "ir", [IIO_MOD_LIGHT_CLEAR] = "clear", [IIO_MOD_LIGHT_RED] = "red", [IIO_MOD_LIGHT_GREEN] = "green", [IIO_MOD_LIGHT_BLUE] = "blue", [IIO_MOD_LIGHT_UV] = "uv", [IIO_MOD_LIGHT_DUV] = "duv", [IIO_MOD_QUATERNION] = "quaternion", [IIO_MOD_TEMP_AMBIENT] = "ambient", [IIO_MOD_TEMP_OBJECT] = "object", [IIO_MOD_NORTH_MAGN] = "from_north_magnetic", [IIO_MOD_NORTH_TRUE] = "from_north_true", [IIO_MOD_NORTH_MAGN_TILT_COMP] = "from_north_magnetic_tilt_comp", [IIO_MOD_NORTH_TRUE_TILT_COMP] = "from_north_true_tilt_comp", [IIO_MOD_RUNNING] = "running", [IIO_MOD_JOGGING] = "jogging", [IIO_MOD_WALKING] = "walking", [IIO_MOD_STILL] = "still", [IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z] = "sqrt(x^2+y^2+z^2)", [IIO_MOD_I] = "i", [IIO_MOD_Q] = "q", [IIO_MOD_CO2] = "co2", [IIO_MOD_VOC] = "voc", [IIO_MOD_PM1] = "pm1", [IIO_MOD_PM2P5] = "pm2p5", [IIO_MOD_PM4] = "pm4", [IIO_MOD_PM10] = "pm10", [IIO_MOD_ETHANOL] = "ethanol", [IIO_MOD_H2] = "h2", [IIO_MOD_O2] = "o2", [IIO_MOD_LINEAR_X] = "linear_x", [IIO_MOD_LINEAR_Y] = "linear_y", [IIO_MOD_LINEAR_Z] = "linear_z", [IIO_MOD_PITCH] = "pitch", [IIO_MOD_YAW] = "yaw", [IIO_MOD_ROLL] = "roll", }; /* relies on pairs of these shared then separate */ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_RAW] = "raw", [IIO_CHAN_INFO_PROCESSED] = "input", [IIO_CHAN_INFO_SCALE] = "scale", [IIO_CHAN_INFO_OFFSET] = "offset", [IIO_CHAN_INFO_CALIBSCALE] = "calibscale", [IIO_CHAN_INFO_CALIBBIAS] = "calibbias", [IIO_CHAN_INFO_PEAK] = "peak_raw", [IIO_CHAN_INFO_PEAK_SCALE] = "peak_scale", [IIO_CHAN_INFO_QUADRATURE_CORRECTION_RAW] = "quadrature_correction_raw", [IIO_CHAN_INFO_AVERAGE_RAW] = "mean_raw", [IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY] = "filter_low_pass_3db_frequency", [IIO_CHAN_INFO_HIGH_PASS_FILTER_3DB_FREQUENCY] = "filter_high_pass_3db_frequency", [IIO_CHAN_INFO_SAMP_FREQ] = "sampling_frequency", [IIO_CHAN_INFO_FREQUENCY] = "frequency", [IIO_CHAN_INFO_PHASE] = "phase", [IIO_CHAN_INFO_HARDWAREGAIN] = "hardwaregain", [IIO_CHAN_INFO_HYSTERESIS] = "hysteresis", [IIO_CHAN_INFO_HYSTERESIS_RELATIVE] = "hysteresis_relative", [IIO_CHAN_INFO_INT_TIME] = "integration_time", [IIO_CHAN_INFO_ENABLE] = "en", [IIO_CHAN_INFO_CALIBHEIGHT] = "calibheight", [IIO_CHAN_INFO_CALIBWEIGHT] = "calibweight", [IIO_CHAN_INFO_DEBOUNCE_COUNT] = "debounce_count", [IIO_CHAN_INFO_DEBOUNCE_TIME] = "debounce_time", [IIO_CHAN_INFO_CALIBEMISSIVITY] = "calibemissivity", [IIO_CHAN_INFO_OVERSAMPLING_RATIO] = "oversampling_ratio", [IIO_CHAN_INFO_THERMOCOUPLE_TYPE] = "thermocouple_type", [IIO_CHAN_INFO_CALIBAMBIENT] = "calibambient", [IIO_CHAN_INFO_ZEROPOINT] = "zeropoint", }; /** * iio_device_id() - query the unique ID for the device * @indio_dev: Device structure whose ID is being queried * * The IIO device ID is a unique index used for example for the naming * of the character device /dev/iio\:device[ID]. * * Returns: Unique ID for the device. */ int iio_device_id(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); return iio_dev_opaque->id; } EXPORT_SYMBOL_GPL(iio_device_id); /** * iio_buffer_enabled() - helper function to test if the buffer is enabled * @indio_dev: IIO device structure for device * * Returns: True, if the buffer is enabled. */ bool iio_buffer_enabled(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); return iio_dev_opaque->currentmode & (INDIO_BUFFER_HARDWARE | INDIO_BUFFER_SOFTWARE | INDIO_BUFFER_TRIGGERED); } EXPORT_SYMBOL_GPL(iio_buffer_enabled); #if defined(CONFIG_DEBUG_FS) /* * There's also a CONFIG_DEBUG_FS guard in include/linux/iio/iio.h for * iio_get_debugfs_dentry() to make it inline if CONFIG_DEBUG_FS is undefined */ struct dentry *iio_get_debugfs_dentry(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); return iio_dev_opaque->debugfs_dentry; } EXPORT_SYMBOL_GPL(iio_get_debugfs_dentry); #endif /** * iio_find_channel_from_si() - get channel from its scan index * @indio_dev: device * @si: scan index to match * * Returns: * Constant pointer to iio_chan_spec, if scan index matches, NULL on failure. */ const struct iio_chan_spec *iio_find_channel_from_si(struct iio_dev *indio_dev, int si) { int i; for (i = 0; i < indio_dev->num_channels; i++) if (indio_dev->channels[i].scan_index == si) return &indio_dev->channels[i]; return NULL; } /* This turns up an awful lot */ ssize_t iio_read_const_attr(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", to_iio_const_attr(attr)->string); } EXPORT_SYMBOL(iio_read_const_attr); /** * iio_device_set_clock() - Set current timestamping clock for the device * @indio_dev: IIO device structure containing the device * @clock_id: timestamping clock POSIX identifier to set. * * Returns: 0 on success, or a negative error code. */ int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id) { int ret; struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); const struct iio_event_interface *ev_int = iio_dev_opaque->event_interface; ret = mutex_lock_interruptible(&iio_dev_opaque->mlock); if (ret) return ret; if ((ev_int && iio_event_enabled(ev_int)) || iio_buffer_enabled(indio_dev)) { mutex_unlock(&iio_dev_opaque->mlock); return -EBUSY; } iio_dev_opaque->clock_id = clock_id; mutex_unlock(&iio_dev_opaque->mlock); return 0; } EXPORT_SYMBOL(iio_device_set_clock); /** * iio_device_get_clock() - Retrieve current timestamping clock for the device * @indio_dev: IIO device structure containing the device * * Returns: Clock ID of the current timestamping clock for the device. */ clockid_t iio_device_get_clock(const struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); return iio_dev_opaque->clock_id; } EXPORT_SYMBOL(iio_device_get_clock); /** * iio_get_time_ns() - utility function to get a time stamp for events etc * @indio_dev: device * * Returns: Timestamp of the event in nanoseconds. */ s64 iio_get_time_ns(const struct iio_dev *indio_dev) { struct timespec64 tp; switch (iio_device_get_clock(indio_dev)) { case CLOCK_REALTIME: return ktime_get_real_ns(); case CLOCK_MONOTONIC: return ktime_get_ns(); case CLOCK_MONOTONIC_RAW: return ktime_get_raw_ns(); case CLOCK_REALTIME_COARSE: return ktime_to_ns(ktime_get_coarse_real()); case CLOCK_MONOTONIC_COARSE: ktime_get_coarse_ts64(&tp); return timespec64_to_ns(&tp); case CLOCK_BOOTTIME: return ktime_get_boottime_ns(); case CLOCK_TAI: return ktime_get_clocktai_ns(); default: BUG(); } } EXPORT_SYMBOL(iio_get_time_ns); static int __init iio_init(void) { int ret; /* Register sysfs bus */ ret = bus_register(&iio_bus_type); if (ret < 0) { pr_err("could not register bus type\n"); goto error_nothing; } ret = alloc_chrdev_region(&iio_devt, 0, IIO_DEV_MAX, "iio"); if (ret < 0) { pr_err("failed to allocate char dev region\n"); goto error_unregister_bus_type; } iio_debugfs_dentry = debugfs_create_dir("iio", NULL); return 0; error_unregister_bus_type: bus_unregister(&iio_bus_type); error_nothing: return ret; } static void __exit iio_exit(void) { if (iio_devt) unregister_chrdev_region(iio_devt, IIO_DEV_MAX); bus_unregister(&iio_bus_type); debugfs_remove(iio_debugfs_dentry); } #if defined(CONFIG_DEBUG_FS) static ssize_t iio_debugfs_read_reg(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct iio_dev *indio_dev = file->private_data; struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); unsigned int val = 0; int ret; if (*ppos > 0) return simple_read_from_buffer(userbuf, count, ppos, iio_dev_opaque->read_buf, iio_dev_opaque->read_buf_len); ret = indio_dev->info->debugfs_reg_access(indio_dev, iio_dev_opaque->cached_reg_addr, 0, &val); if (ret) { dev_err(indio_dev->dev.parent, "%s: read failed\n", __func__); return ret; } iio_dev_opaque->read_buf_len = snprintf(iio_dev_opaque->read_buf, sizeof(iio_dev_opaque->read_buf), "0x%X\n", val); return simple_read_from_buffer(userbuf, count, ppos, iio_dev_opaque->read_buf, iio_dev_opaque->read_buf_len); } static ssize_t iio_debugfs_write_reg(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct iio_dev *indio_dev = file->private_data; struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); unsigned int reg, val; char buf[80]; int ret; count = min(count, sizeof(buf) - 1); if (copy_from_user(buf, userbuf, count)) return -EFAULT; buf[count] = 0; ret = sscanf(buf, "%i %i", &reg, &val); switch (ret) { case 1: iio_dev_opaque->cached_reg_addr = reg; break; case 2: iio_dev_opaque->cached_reg_addr = reg; ret = indio_dev->info->debugfs_reg_access(indio_dev, reg, val, NULL); if (ret) { dev_err(indio_dev->dev.parent, "%s: write failed\n", __func__); return ret; } break; default: return -EINVAL; } return count; } static const struct file_operations iio_debugfs_reg_fops = { .open = simple_open, .read = iio_debugfs_read_reg, .write = iio_debugfs_write_reg, }; static void iio_device_unregister_debugfs(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); debugfs_remove_recursive(iio_dev_opaque->debugfs_dentry); } static void iio_device_register_debugfs(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque; if (indio_dev->info->debugfs_reg_access == NULL) return; if (!iio_debugfs_dentry) return; iio_dev_opaque = to_iio_dev_opaque(indio_dev); iio_dev_opaque->debugfs_dentry = debugfs_create_dir(dev_name(&indio_dev->dev), iio_debugfs_dentry); debugfs_create_file("direct_reg_access", 0644, iio_dev_opaque->debugfs_dentry, indio_dev, &iio_debugfs_reg_fops); } #else static void iio_device_register_debugfs(struct iio_dev *indio_dev) { } static void iio_device_unregister_debugfs(struct iio_dev *indio_dev) { } #endif /* CONFIG_DEBUG_FS */ static ssize_t iio_read_channel_ext_info(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_attr *this_attr = to_iio_dev_attr(attr); const struct iio_chan_spec_ext_info *ext_info; ext_info = &this_attr->c->ext_info[this_attr->address]; return ext_info->read(indio_dev, ext_info->private, this_attr->c, buf); } static ssize_t iio_write_channel_ext_info(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_attr *this_attr = to_iio_dev_attr(attr); const struct iio_chan_spec_ext_info *ext_info; ext_info = &this_attr->c->ext_info[this_attr->address]; return ext_info->write(indio_dev, ext_info->private, this_attr->c, buf, len); } ssize_t iio_enum_available_read(struct iio_dev *indio_dev, uintptr_t priv, const struct iio_chan_spec *chan, char *buf) { const struct iio_enum *e = (const struct iio_enum *)priv; unsigned int i; size_t len = 0; if (!e->num_items) return 0; for (i = 0; i < e->num_items; ++i) { if (!e->items[i]) continue; len += sysfs_emit_at(buf, len, "%s ", e->items[i]); } /* replace last space with a newline */ buf[len - 1] = '\n'; return len; } EXPORT_SYMBOL_GPL(iio_enum_available_read); ssize_t iio_enum_read(struct iio_dev *indio_dev, uintptr_t priv, const struct iio_chan_spec *chan, char *buf) { const struct iio_enum *e = (const struct iio_enum *)priv; int i; if (!e->get) return -EINVAL; i = e->get(indio_dev, chan); if (i < 0) return i; if (i >= e->num_items || !e->items[i]) return -EINVAL; return sysfs_emit(buf, "%s\n", e->items[i]); } EXPORT_SYMBOL_GPL(iio_enum_read); ssize_t iio_enum_write(struct iio_dev *indio_dev, uintptr_t priv, const struct iio_chan_spec *chan, const char *buf, size_t len) { const struct iio_enum *e = (const struct iio_enum *)priv; int ret; if (!e->set) return -EINVAL; ret = __sysfs_match_string(e->items, e->num_items, buf); if (ret < 0) return ret; ret = e->set(indio_dev, chan, ret); return ret ? ret : len; } EXPORT_SYMBOL_GPL(iio_enum_write); static const struct iio_mount_matrix iio_mount_idmatrix = { .rotation = { "1", "0", "0", "0", "1", "0", "0", "0", "1" } }; static int iio_setup_mount_idmatrix(const struct device *dev, struct iio_mount_matrix *matrix) { *matrix = iio_mount_idmatrix; dev_info(dev, "mounting matrix not found: using identity...\n"); return 0; } ssize_t iio_show_mount_matrix(struct iio_dev *indio_dev, uintptr_t priv, const struct iio_chan_spec *chan, char *buf) { const struct iio_mount_matrix *mtx; mtx = ((iio_get_mount_matrix_t *)priv)(indio_dev, chan); if (IS_ERR(mtx)) return PTR_ERR(mtx); if (!mtx) mtx = &iio_mount_idmatrix; return sysfs_emit(buf, "%s, %s, %s; %s, %s, %s; %s, %s, %s\n", mtx->rotation[0], mtx->rotation[1], mtx->rotation[2], mtx->rotation[3], mtx->rotation[4], mtx->rotation[5], mtx->rotation[6], mtx->rotation[7], mtx->rotation[8]); } EXPORT_SYMBOL_GPL(iio_show_mount_matrix); /** * iio_read_mount_matrix() - retrieve iio device mounting matrix from * device "mount-matrix" property * @dev: device the mounting matrix property is assigned to * @matrix: where to store retrieved matrix * * If device is assigned no mounting matrix property, a default 3x3 identity * matrix will be filled in. * * Returns: 0 if success, or a negative error code on failure. */ int iio_read_mount_matrix(struct device *dev, struct iio_mount_matrix *matrix) { size_t len = ARRAY_SIZE(iio_mount_idmatrix.rotation); int err; err = device_property_read_string_array(dev, "mount-matrix", matrix->rotation, len); if (err == len) return 0; if (err >= 0) /* Invalid number of matrix entries. */ return -EINVAL; if (err != -EINVAL) /* Invalid matrix declaration format. */ return err; /* Matrix was not declared at all: fallback to identity. */ return iio_setup_mount_idmatrix(dev, matrix); } EXPORT_SYMBOL(iio_read_mount_matrix); static ssize_t __iio_format_value(char *buf, size_t offset, unsigned int type, int size, const int *vals) { int tmp0, tmp1; s64 tmp2; bool scale_db = false; switch (type) { case IIO_VAL_INT: return sysfs_emit_at(buf, offset, "%d", vals[0]); case IIO_VAL_INT_PLUS_MICRO_DB: scale_db = true; fallthrough; case IIO_VAL_INT_PLUS_MICRO: if (vals[1] < 0) return sysfs_emit_at(buf, offset, "-%d.%06u%s", abs(vals[0]), -vals[1], scale_db ? " dB" : ""); else return sysfs_emit_at(buf, offset, "%d.%06u%s", vals[0], vals[1], scale_db ? " dB" : ""); case IIO_VAL_INT_PLUS_NANO: if (vals[1] < 0) return sysfs_emit_at(buf, offset, "-%d.%09u", abs(vals[0]), -vals[1]); else return sysfs_emit_at(buf, offset, "%d.%09u", vals[0], vals[1]); case IIO_VAL_FRACTIONAL: tmp2 = div_s64((s64)vals[0] * 1000000000LL, vals[1]); tmp1 = vals[1]; tmp0 = (int)div_s64_rem(tmp2, 1000000000, &tmp1); if ((tmp2 < 0) && (tmp0 == 0)) return sysfs_emit_at(buf, offset, "-0.%09u", abs(tmp1)); else return sysfs_emit_at(buf, offset, "%d.%09u", tmp0, abs(tmp1)); case IIO_VAL_FRACTIONAL_LOG2: tmp2 = shift_right((s64)vals[0] * 1000000000LL, vals[1]); tmp0 = (int)div_s64_rem(tmp2, 1000000000LL, &tmp1); if (tmp0 == 0 && tmp2 < 0) return sysfs_emit_at(buf, offset, "-0.%09u", abs(tmp1)); else return sysfs_emit_at(buf, offset, "%d.%09u", tmp0, abs(tmp1)); case IIO_VAL_INT_MULTIPLE: { int i; int l = 0; for (i = 0; i < size; ++i) l += sysfs_emit_at(buf, offset + l, "%d ", vals[i]); return l; } case IIO_VAL_CHAR: return sysfs_emit_at(buf, offset, "%c", (char)vals[0]); case IIO_VAL_INT_64: tmp2 = (s64)((((u64)vals[1]) << 32) | (u32)vals[0]); return sysfs_emit_at(buf, offset, "%lld", tmp2); default: return 0; } } /** * iio_format_value() - Formats a IIO value into its string representation * @buf: The buffer to which the formatted value gets written * which is assumed to be big enough (i.e. PAGE_SIZE). * @type: One of the IIO_VAL_* constants. This decides how the val * and val2 parameters are formatted. * @size: Number of IIO value entries contained in vals * @vals: Pointer to the values, exact meaning depends on the * type parameter. * * Returns: * 0 by default, a negative number on failure or the total number of characters * written for a type that belongs to the IIO_VAL_* constant. */ ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals) { ssize_t len; len = __iio_format_value(buf, 0, type, size, vals); if (len >= PAGE_SIZE - 1) return -EFBIG; return len + sysfs_emit_at(buf, len, "\n"); } EXPORT_SYMBOL_GPL(iio_format_value); static ssize_t iio_read_channel_label(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_attr *this_attr = to_iio_dev_attr(attr); if (indio_dev->info->read_label) return indio_dev->info->read_label(indio_dev, this_attr->c, buf); if (this_attr->c->extend_name) return sysfs_emit(buf, "%s\n", this_attr->c->extend_name); return -EINVAL; } static ssize_t iio_read_channel_info(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_attr *this_attr = to_iio_dev_attr(attr); int vals[INDIO_MAX_RAW_ELEMENTS]; int ret; int val_len = 2; if (indio_dev->info->read_raw_multi) ret = indio_dev->info->read_raw_multi(indio_dev, this_attr->c, INDIO_MAX_RAW_ELEMENTS, vals, &val_len, this_attr->address); else ret = indio_dev->info->read_raw(indio_dev, this_attr->c, &vals[0], &vals[1], this_attr->address); if (ret < 0) return ret; return iio_format_value(buf, ret, val_len, vals); } static ssize_t iio_format_list(char *buf, const int *vals, int type, int length, const char *prefix, const char *suffix) { ssize_t len; int stride; int i; switch (type) { case IIO_VAL_INT: stride = 1; break; default: stride = 2; break; } len = sysfs_emit(buf, prefix); for (i = 0; i <= length - stride; i += stride) { if (i != 0) { len += sysfs_emit_at(buf, len, " "); if (len >= PAGE_SIZE) return -EFBIG; } len += __iio_format_value(buf, len, type, stride, &vals[i]); if (len >= PAGE_SIZE) return -EFBIG; } len += sysfs_emit_at(buf, len, "%s\n", suffix); return len; } static ssize_t iio_format_avail_list(char *buf, const int *vals, int type, int length) { return iio_format_list(buf, vals, type, length, "", ""); } static ssize_t iio_format_avail_range(char *buf, const int *vals, int type) { int length; /* * length refers to the array size , not the number of elements. * The purpose is to print the range [min , step ,max] so length should * be 3 in case of int, and 6 for other types. */ switch (type) { case IIO_VAL_INT: length = 3; break; default: length = 6; break; } return iio_format_list(buf, vals, type, length, "[", "]"); } static ssize_t iio_read_channel_info_avail(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_attr *this_attr = to_iio_dev_attr(attr); const int *vals; int ret; int length; int type; ret = indio_dev->info->read_avail(indio_dev, this_attr->c, &vals, &type, &length, this_attr->address); if (ret < 0) return ret; switch (ret) { case IIO_AVAIL_LIST: return iio_format_avail_list(buf, vals, type, length); case IIO_AVAIL_RANGE: return iio_format_avail_range(buf, vals, type); default: return -EINVAL; } } /** * __iio_str_to_fixpoint() - Parse a fixed-point number from a string * @str: The string to parse * @fract_mult: Multiplier for the first decimal place, should be a power of 10 * @integer: The integer part of the number * @fract: The fractional part of the number * @scale_db: True if this should parse as dB * * Returns: * 0 on success, or a negative error code if the string could not be parsed. */ static int __iio_str_to_fixpoint(const char *str, int fract_mult, int *integer, int *fract, bool scale_db) { int i = 0, f = 0; bool integer_part = true, negative = false; if (fract_mult == 0) { *fract = 0; return kstrtoint(str, 0, integer); } if (str[0] == '-') { negative = true; str++; } else if (str[0] == '+') { str++; } while (*str) { if ('0' <= *str && *str <= '9') { if (integer_part) { i = i * 10 + *str - '0'; } else { f += fract_mult * (*str - '0'); fract_mult /= 10; } } else if (*str == '\n') { if (*(str + 1) == '\0') break; return -EINVAL; } else if (!strncmp(str, " dB", sizeof(" dB") - 1) && scale_db) { /* Ignore the dB suffix */ str += sizeof(" dB") - 1; continue; } else if (!strncmp(str, "dB", sizeof("dB") - 1) && scale_db) { /* Ignore the dB suffix */ str += sizeof("dB") - 1; continue; } else if (*str == '.' && integer_part) { integer_part = false; } else { return -EINVAL; } str++; } if (negative) { if (i) i = -i; else f = -f; } *integer = i; *fract = f; return 0; } /** * iio_str_to_fixpoint() - Parse a fixed-point number from a string * @str: The string to parse * @fract_mult: Multiplier for the first decimal place, should be a power of 10 * @integer: The integer part of the number * @fract: The fractional part of the number * * Returns: * 0 on success, or a negative error code if the string could not be parsed. */ int iio_str_to_fixpoint(const char *str, int fract_mult, int *integer, int *fract) { return __iio_str_to_fixpoint(str, fract_mult, integer, fract, false); } EXPORT_SYMBOL_GPL(iio_str_to_fixpoint); static ssize_t iio_write_channel_info(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct iio_dev_attr *this_attr = to_iio_dev_attr(attr); int ret, fract_mult = 100000; int integer, fract = 0; bool is_char = false; bool scale_db = false; /* Assumes decimal - precision based on number of digits */ if (!indio_dev->info->write_raw) return -EINVAL; if (indio_dev->info->write_raw_get_fmt) switch (indio_dev->info->write_raw_get_fmt(indio_dev, this_attr->c, this_attr->address)) { case IIO_VAL_INT: fract_mult = 0; break; case IIO_VAL_INT_PLUS_MICRO_DB: scale_db = true; fallthrough; case IIO_VAL_INT_PLUS_MICRO: fract_mult = 100000; break; case IIO_VAL_INT_PLUS_NANO: fract_mult = 100000000; break; case IIO_VAL_CHAR: is_char = true; break; default: return -EINVAL; } if (is_char) { char ch; if (sscanf(buf, "%c", &ch) != 1) return -EINVAL; integer = ch; } else { ret = __iio_str_to_fixpoint(buf, fract_mult, &integer, &fract, scale_db); if (ret) return ret; } ret = indio_dev->info->write_raw(indio_dev, this_attr->c, integer, fract, this_attr->address); if (ret) return ret; return len; } static int __iio_device_attr_init(struct device_attribute *dev_attr, const char *postfix, struct iio_chan_spec const *chan, ssize_t (*readfunc)(struct device *dev, struct device_attribute *attr, char *buf), ssize_t (*writefunc)(struct device *dev, struct device_attribute *attr, const char *buf, size_t len), enum iio_shared_by shared_by) { int ret = 0; char *name = NULL; char *full_postfix; sysfs_attr_init(&dev_attr->attr); /* Build up postfix of <extend_name>_<modifier>_postfix */ if (chan->modified && (shared_by == IIO_SEPARATE)) { if (chan->extend_name) full_postfix = kasprintf(GFP_KERNEL, "%s_%s_%s", iio_modifier_names[chan->channel2], chan->extend_name, postfix); else full_postfix = kasprintf(GFP_KERNEL, "%s_%s", iio_modifier_names[chan->channel2], postfix); } else { if (chan->extend_name == NULL || shared_by != IIO_SEPARATE) full_postfix = kstrdup(postfix, GFP_KERNEL); else full_postfix = kasprintf(GFP_KERNEL, "%s_%s", chan->extend_name, postfix); } if (full_postfix == NULL) return -ENOMEM; if (chan->differential) { /* Differential can not have modifier */ switch (shared_by) { case IIO_SHARED_BY_ALL: name = kasprintf(GFP_KERNEL, "%s", full_postfix); break; case IIO_SHARED_BY_DIR: name = kasprintf(GFP_KERNEL, "%s_%s", iio_direction[chan->output], full_postfix); break; case IIO_SHARED_BY_TYPE: name = kasprintf(GFP_KERNEL, "%s_%s-%s_%s", iio_direction[chan->output], iio_chan_type_name_spec[chan->type], iio_chan_type_name_spec[chan->type], full_postfix); break; case IIO_SEPARATE: if (!chan->indexed) { WARN(1, "Differential channels must be indexed\n"); ret = -EINVAL; goto error_free_full_postfix; } name = kasprintf(GFP_KERNEL, "%s_%s%d-%s%d_%s", iio_direction[chan->output], iio_chan_type_name_spec[chan->type], chan->channel, iio_chan_type_name_spec[chan->type], chan->channel2, full_postfix); break; } } else { /* Single ended */ switch (shared_by) { case IIO_SHARED_BY_ALL: name = kasprintf(GFP_KERNEL, "%s", full_postfix); break; case IIO_SHARED_BY_DIR: name = kasprintf(GFP_KERNEL, "%s_%s", iio_direction[chan->output], full_postfix); break; case IIO_SHARED_BY_TYPE: name = kasprintf(GFP_KERNEL, "%s_%s_%s", iio_direction[chan->output], iio_chan_type_name_spec[chan->type], full_postfix); break; case IIO_SEPARATE: if (chan->indexed) name = kasprintf(GFP_KERNEL, "%s_%s%d_%s", iio_direction[chan->output], iio_chan_type_name_spec[chan->type], chan->channel, full_postfix); else name = kasprintf(GFP_KERNEL, "%s_%s_%s", iio_direction[chan->output], iio_chan_type_name_spec[chan->type], full_postfix); break; } } if (name == NULL) { ret = -ENOMEM; goto error_free_full_postfix; } dev_attr->attr.name = name; if (readfunc) { dev_attr->attr.mode |= 0444; dev_attr->show = readfunc; } if (writefunc) { dev_attr->attr.mode |= 0200; dev_attr->store = writefunc; } error_free_full_postfix: kfree(full_postfix); return ret; } static void __iio_device_attr_deinit(struct device_attribute *dev_attr) { kfree(dev_attr->attr.name); } int __iio_add_chan_devattr(const char *postfix, struct iio_chan_spec const *chan, ssize_t (*readfunc)(struct device *dev, struct device_attribute *attr, char *buf), ssize_t (*writefunc)(struct device *dev, struct device_attribute *attr, const char *buf, size_t len), u64 mask, enum iio_shared_by shared_by, struct device *dev, struct iio_buffer *buffer, struct list_head *attr_list) { int ret; struct iio_dev_attr *iio_attr, *t; iio_attr = kzalloc(sizeof(*iio_attr), GFP_KERNEL); if (iio_attr == NULL) return -ENOMEM; ret = __iio_device_attr_init(&iio_attr->dev_attr, postfix, chan, readfunc, writefunc, shared_by); if (ret) goto error_iio_dev_attr_free; iio_attr->c = chan; iio_attr->address = mask; iio_attr->buffer = buffer; list_for_each_entry(t, attr_list, l) if (strcmp(t->dev_attr.attr.name, iio_attr->dev_attr.attr.name) == 0) { if (shared_by == IIO_SEPARATE) dev_err(dev, "tried to double register : %s\n", t->dev_attr.attr.name); ret = -EBUSY; goto error_device_attr_deinit; } list_add(&iio_attr->l, attr_list); return 0; error_device_attr_deinit: __iio_device_attr_deinit(&iio_attr->dev_attr); error_iio_dev_attr_free: kfree(iio_attr); return ret; } static int iio_device_add_channel_label(struct iio_dev *indio_dev, struct iio_chan_spec const *chan) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int ret; if (!indio_dev->info->read_label && !chan->extend_name) return 0; ret = __iio_add_chan_devattr("label", chan, &iio_read_channel_label, NULL, 0, IIO_SEPARATE, &indio_dev->dev, NULL, &iio_dev_opaque->channel_attr_list); if (ret < 0) return ret; return 1; } static int iio_device_add_info_mask_type(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, enum iio_shared_by shared_by, const long *infomask) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int i, ret, attrcount = 0; for_each_set_bit(i, infomask, sizeof(*infomask)*8) { if (i >= ARRAY_SIZE(iio_chan_info_postfix)) return -EINVAL; ret = __iio_add_chan_devattr(iio_chan_info_postfix[i], chan, &iio_read_channel_info, &iio_write_channel_info, i, shared_by, &indio_dev->dev, NULL, &iio_dev_opaque->channel_attr_list); if ((ret == -EBUSY) && (shared_by != IIO_SEPARATE)) continue; if (ret < 0) return ret; attrcount++; } return attrcount; } static int iio_device_add_info_mask_type_avail(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, enum iio_shared_by shared_by, const long *infomask) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int i, ret, attrcount = 0; char *avail_postfix; for_each_set_bit(i, infomask, sizeof(*infomask) * 8) { if (i >= ARRAY_SIZE(iio_chan_info_postfix)) return -EINVAL; avail_postfix = kasprintf(GFP_KERNEL, "%s_available", iio_chan_info_postfix[i]); if (!avail_postfix) return -ENOMEM; ret = __iio_add_chan_devattr(avail_postfix, chan, &iio_read_channel_info_avail, NULL, i, shared_by, &indio_dev->dev, NULL, &iio_dev_opaque->channel_attr_list); kfree(avail_postfix); if ((ret == -EBUSY) && (shared_by != IIO_SEPARATE)) continue; if (ret < 0) return ret; attrcount++; } return attrcount; } static int iio_device_add_channel_sysfs(struct iio_dev *indio_dev, struct iio_chan_spec const *chan) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int ret, attrcount = 0; const struct iio_chan_spec_ext_info *ext_info; if (chan->channel < 0) return 0; ret = iio_device_add_info_mask_type(indio_dev, chan, IIO_SEPARATE, &chan->info_mask_separate); if (ret < 0) return ret; attrcount += ret; ret = iio_device_add_info_mask_type_avail(indio_dev, chan, IIO_SEPARATE, &chan->info_mask_separate_available); if (ret < 0) return ret; attrcount += ret; ret = iio_device_add_info_mask_type(indio_dev, chan, IIO_SHARED_BY_TYPE, &chan->info_mask_shared_by_type); if (ret < 0) return ret; attrcount += ret; ret = iio_device_add_info_mask_type_avail(indio_dev, chan, IIO_SHARED_BY_TYPE, &chan->info_mask_shared_by_type_available); if (ret < 0) return ret; attrcount += ret; ret = iio_device_add_info_mask_type(indio_dev, chan, IIO_SHARED_BY_DIR, &chan->info_mask_shared_by_dir); if (ret < 0) return ret; attrcount += ret; ret = iio_device_add_info_mask_type_avail(indio_dev, chan, IIO_SHARED_BY_DIR, &chan->info_mask_shared_by_dir_available); if (ret < 0) return ret; attrcount += ret; ret = iio_device_add_info_mask_type(indio_dev, chan, IIO_SHARED_BY_ALL, &chan->info_mask_shared_by_all); if (ret < 0) return ret; attrcount += ret; ret = iio_device_add_info_mask_type_avail(indio_dev, chan, IIO_SHARED_BY_ALL, &chan->info_mask_shared_by_all_available); if (ret < 0) return ret; attrcount += ret; ret = iio_device_add_channel_label(indio_dev, chan); if (ret < 0) return ret; attrcount += ret; if (chan->ext_info) { unsigned int i = 0; for (ext_info = chan->ext_info; ext_info->name; ext_info++) { ret = __iio_add_chan_devattr(ext_info->name, chan, ext_info->read ? &iio_read_channel_ext_info : NULL, ext_info->write ? &iio_write_channel_ext_info : NULL, i, ext_info->shared, &indio_dev->dev, NULL, &iio_dev_opaque->channel_attr_list); i++; if (ret == -EBUSY && ext_info->shared) continue; if (ret) return ret; attrcount++; } } return attrcount; } /** * iio_free_chan_devattr_list() - Free a list of IIO device attributes * @attr_list: List of IIO device attributes * * This function frees the memory allocated for each of the IIO device * attributes in the list. */ void iio_free_chan_devattr_list(struct list_head *attr_list) { struct iio_dev_attr *p, *n; list_for_each_entry_safe(p, n, attr_list, l) { kfree_const(p->dev_attr.attr.name); list_del(&p->l); kfree(p); } } static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); return sysfs_emit(buf, "%s\n", indio_dev->name); } static DEVICE_ATTR_RO(name); static ssize_t label_show(struct device *dev, struct device_attribute *attr, char *buf) { struct iio_dev *indio_dev = dev_to_iio_dev(dev); return sysfs_emit(buf, "%s\n", indio_dev->label); } static DEVICE_ATTR_RO(label); static const char * const clock_names[] = { [CLOCK_REALTIME] = "realtime", [CLOCK_MONOTONIC] = "monotonic", [CLOCK_PROCESS_CPUTIME_ID] = "process_cputime_id", [CLOCK_THREAD_CPUTIME_ID] = "thread_cputime_id", [CLOCK_MONOTONIC_RAW] = "monotonic_raw", [CLOCK_REALTIME_COARSE] = "realtime_coarse", [CLOCK_MONOTONIC_COARSE] = "monotonic_coarse", [CLOCK_BOOTTIME] = "boottime", [CLOCK_REALTIME_ALARM] = "realtime_alarm", [CLOCK_BOOTTIME_ALARM] = "boottime_alarm", [CLOCK_SGI_CYCLE] = "sgi_cycle", [CLOCK_TAI] = "tai", }; static ssize_t current_timestamp_clock_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct iio_dev *indio_dev = dev_to_iio_dev(dev); const clockid_t clk = iio_device_get_clock(indio_dev); switch (clk) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_RAW: case CLOCK_REALTIME_COARSE: case CLOCK_MONOTONIC_COARSE: case CLOCK_BOOTTIME: case CLOCK_TAI: break; default: BUG(); } return sysfs_emit(buf, "%s\n", clock_names[clk]); } static ssize_t current_timestamp_clock_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { clockid_t clk; int ret; ret = sysfs_match_string(clock_names, buf); if (ret < 0) return ret; clk = ret; switch (clk) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_RAW: case CLOCK_REALTIME_COARSE: case CLOCK_MONOTONIC_COARSE: case CLOCK_BOOTTIME: case CLOCK_TAI: break; default: return -EINVAL; } ret = iio_device_set_clock(dev_to_iio_dev(dev), clk); if (ret) return ret; return len; } int iio_device_register_sysfs_group(struct iio_dev *indio_dev, const struct attribute_group *group) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); const struct attribute_group **new, **old = iio_dev_opaque->groups; unsigned int cnt = iio_dev_opaque->groupcounter; new = krealloc_array(old, cnt + 2, sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; new[iio_dev_opaque->groupcounter++] = group; new[iio_dev_opaque->groupcounter] = NULL; iio_dev_opaque->groups = new; return 0; } static DEVICE_ATTR_RW(current_timestamp_clock); static int iio_device_register_sysfs(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int i, ret = 0, attrcount, attrn, attrcount_orig = 0; struct iio_dev_attr *p; struct attribute **attr, *clk = NULL; /* First count elements in any existing group */ if (indio_dev->info->attrs) { attr = indio_dev->info->attrs->attrs; while (*attr++ != NULL) attrcount_orig++; } attrcount = attrcount_orig; /* * New channel registration method - relies on the fact a group does * not need to be initialized if its name is NULL. */ if (indio_dev->channels) for (i = 0; i < indio_dev->num_channels; i++) { const struct iio_chan_spec *chan = &indio_dev->channels[i]; if (chan->type == IIO_TIMESTAMP) clk = &dev_attr_current_timestamp_clock.attr; ret = iio_device_add_channel_sysfs(indio_dev, chan); if (ret < 0) goto error_clear_attrs; attrcount += ret; } if (iio_dev_opaque->event_interface) clk = &dev_attr_current_timestamp_clock.attr; if (indio_dev->name) attrcount++; if (indio_dev->label) attrcount++; if (clk) attrcount++; iio_dev_opaque->chan_attr_group.attrs = kcalloc(attrcount + 1, sizeof(iio_dev_opaque->chan_attr_group.attrs[0]), GFP_KERNEL); if (iio_dev_opaque->chan_attr_group.attrs == NULL) { ret = -ENOMEM; goto error_clear_attrs; } /* Copy across original attributes, and point to original binary attributes */ if (indio_dev->info->attrs) { memcpy(iio_dev_opaque->chan_attr_group.attrs, indio_dev->info->attrs->attrs, sizeof(iio_dev_opaque->chan_attr_group.attrs[0]) *attrcount_orig); iio_dev_opaque->chan_attr_group.is_visible = indio_dev->info->attrs->is_visible; iio_dev_opaque->chan_attr_group.bin_attrs = indio_dev->info->attrs->bin_attrs; } attrn = attrcount_orig; /* Add all elements from the list. */ list_for_each_entry(p, &iio_dev_opaque->channel_attr_list, l) iio_dev_opaque->chan_attr_group.attrs[attrn++] = &p->dev_attr.attr; if (indio_dev->name) iio_dev_opaque->chan_attr_group.attrs[attrn++] = &dev_attr_name.attr; if (indio_dev->label) iio_dev_opaque->chan_attr_group.attrs[attrn++] = &dev_attr_label.attr; if (clk) iio_dev_opaque->chan_attr_group.attrs[attrn++] = clk; ret = iio_device_register_sysfs_group(indio_dev, &iio_dev_opaque->chan_attr_group); if (ret) goto error_clear_attrs; return 0; error_clear_attrs: iio_free_chan_devattr_list(&iio_dev_opaque->channel_attr_list); return ret; } static void iio_device_unregister_sysfs(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); iio_free_chan_devattr_list(&iio_dev_opaque->channel_attr_list); kfree(iio_dev_opaque->chan_attr_group.attrs); iio_dev_opaque->chan_attr_group.attrs = NULL; kfree(iio_dev_opaque->groups); iio_dev_opaque->groups = NULL; } static void iio_dev_release(struct device *device) { struct iio_dev *indio_dev = dev_to_iio_dev(device); struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); if (indio_dev->modes & INDIO_ALL_TRIGGERED_MODES) iio_device_unregister_trigger_consumer(indio_dev); iio_device_unregister_eventset(indio_dev); iio_device_unregister_sysfs(indio_dev); iio_device_detach_buffers(indio_dev); lockdep_unregister_key(&iio_dev_opaque->mlock_key); ida_free(&iio_ida, iio_dev_opaque->id); kfree(iio_dev_opaque); } const struct device_type iio_device_type = { .name = "iio_device", .release = iio_dev_release, }; /** * iio_device_alloc() - allocate an iio_dev from a driver * @parent: Parent device. * @sizeof_priv: Space to allocate for private structure. * * Returns: * Pointer to allocated iio_dev on success, NULL on failure. */ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv) { struct iio_dev_opaque *iio_dev_opaque; struct iio_dev *indio_dev; size_t alloc_size; alloc_size = sizeof(struct iio_dev_opaque); if (sizeof_priv) { alloc_size = ALIGN(alloc_size, IIO_DMA_MINALIGN); alloc_size += sizeof_priv; } iio_dev_opaque = kzalloc(alloc_size, GFP_KERNEL); if (!iio_dev_opaque) return NULL; indio_dev = &iio_dev_opaque->indio_dev; indio_dev->priv = (char *)iio_dev_opaque + ALIGN(sizeof(struct iio_dev_opaque), IIO_DMA_MINALIGN); indio_dev->dev.parent = parent; indio_dev->dev.type = &iio_device_type; indio_dev->dev.bus = &iio_bus_type; device_initialize(&indio_dev->dev); mutex_init(&iio_dev_opaque->mlock); mutex_init(&iio_dev_opaque->info_exist_lock); INIT_LIST_HEAD(&iio_dev_opaque->channel_attr_list); iio_dev_opaque->id = ida_alloc(&iio_ida, GFP_KERNEL); if (iio_dev_opaque->id < 0) { /* cannot use a dev_err as the name isn't available */ pr_err("failed to get device id\n"); kfree(iio_dev_opaque); return NULL; } if (dev_set_name(&indio_dev->dev, "iio:device%d", iio_dev_opaque->id)) { ida_free(&iio_ida, iio_dev_opaque->id); kfree(iio_dev_opaque); return NULL; } INIT_LIST_HEAD(&iio_dev_opaque->buffer_list); INIT_LIST_HEAD(&iio_dev_opaque->ioctl_handlers); lockdep_register_key(&iio_dev_opaque->mlock_key); lockdep_set_class(&iio_dev_opaque->mlock, &iio_dev_opaque->mlock_key); return indio_dev; } EXPORT_SYMBOL(iio_device_alloc); /** * iio_device_free() - free an iio_dev from a driver * @dev: the iio_dev associated with the device */ void iio_device_free(struct iio_dev *dev) { if (dev) put_device(&dev->dev); } EXPORT_SYMBOL(iio_device_free); static void devm_iio_device_release(void *iio_dev) { iio_device_free(iio_dev); } /** * devm_iio_device_alloc - Resource-managed iio_device_alloc() * @parent: Device to allocate iio_dev for, and parent for this IIO device * @sizeof_priv: Space to allocate for private structure. * * Managed iio_device_alloc. iio_dev allocated with this function is * automatically freed on driver detach. * * Returns: * Pointer to allocated iio_dev on success, NULL on failure. */ struct iio_dev *devm_iio_device_alloc(struct device *parent, int sizeof_priv) { struct iio_dev *iio_dev; int ret; iio_dev = iio_device_alloc(parent, sizeof_priv); if (!iio_dev) return NULL; ret = devm_add_action_or_reset(parent, devm_iio_device_release, iio_dev); if (ret) return NULL; return iio_dev; } EXPORT_SYMBOL_GPL(devm_iio_device_alloc); /** * iio_chrdev_open() - chrdev file open for buffer access and ioctls * @inode: Inode structure for identifying the device in the file system * @filp: File structure for iio device used to keep and later access * private data * * Returns: 0 on success or -EBUSY if the device is already opened */ static int iio_chrdev_open(struct inode *inode, struct file *filp) { struct iio_dev_opaque *iio_dev_opaque = container_of(inode->i_cdev, struct iio_dev_opaque, chrdev); struct iio_dev *indio_dev = &iio_dev_opaque->indio_dev; struct iio_dev_buffer_pair *ib; if (test_and_set_bit(IIO_BUSY_BIT_POS, &iio_dev_opaque->flags)) return -EBUSY; iio_device_get(indio_dev); ib = kmalloc(sizeof(*ib), GFP_KERNEL); if (!ib) { iio_device_put(indio_dev); clear_bit(IIO_BUSY_BIT_POS, &iio_dev_opaque->flags); return -ENOMEM; } ib->indio_dev = indio_dev; ib->buffer = indio_dev->buffer; filp->private_data = ib; return 0; } /** * iio_chrdev_release() - chrdev file close buffer access and ioctls * @inode: Inode structure pointer for the char device * @filp: File structure pointer for the char device * * Returns: 0 for successful release. */ static int iio_chrdev_release(struct inode *inode, struct file *filp) { struct iio_dev_buffer_pair *ib = filp->private_data; struct iio_dev_opaque *iio_dev_opaque = container_of(inode->i_cdev, struct iio_dev_opaque, chrdev); struct iio_dev *indio_dev = &iio_dev_opaque->indio_dev; kfree(ib); clear_bit(IIO_BUSY_BIT_POS, &iio_dev_opaque->flags); iio_device_put(indio_dev); return 0; } void iio_device_ioctl_handler_register(struct iio_dev *indio_dev, struct iio_ioctl_handler *h) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); list_add_tail(&h->entry, &iio_dev_opaque->ioctl_handlers); } void iio_device_ioctl_handler_unregister(struct iio_ioctl_handler *h) { list_del(&h->entry); } static long iio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct iio_dev_buffer_pair *ib = filp->private_data; struct iio_dev *indio_dev = ib->indio_dev; struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct iio_ioctl_handler *h; int ret = -ENODEV; mutex_lock(&iio_dev_opaque->info_exist_lock); /* * The NULL check here is required to prevent crashing when a device * is being removed while userspace would still have open file handles * to try to access this device. */ if (!indio_dev->info) goto out_unlock; list_for_each_entry(h, &iio_dev_opaque->ioctl_handlers, entry) { ret = h->ioctl(indio_dev, filp, cmd, arg); if (ret != IIO_IOCTL_UNHANDLED) break; } if (ret == IIO_IOCTL_UNHANDLED) ret = -ENODEV; out_unlock: mutex_unlock(&iio_dev_opaque->info_exist_lock); return ret; } static const struct file_operations iio_buffer_fileops = { .owner = THIS_MODULE, .llseek = noop_llseek, .read = iio_buffer_read_outer_addr, .write = iio_buffer_write_outer_addr, .poll = iio_buffer_poll_addr, .unlocked_ioctl = iio_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = iio_chrdev_open, .release = iio_chrdev_release, }; static const struct file_operations iio_event_fileops = { .owner = THIS_MODULE, .llseek = noop_llseek, .unlocked_ioctl = iio_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = iio_chrdev_open, .release = iio_chrdev_release, }; static int iio_check_unique_scan_index(struct iio_dev *indio_dev) { int i, j; const struct iio_chan_spec *channels = indio_dev->channels; if (!(indio_dev->modes & INDIO_ALL_BUFFER_MODES)) return 0; for (i = 0; i < indio_dev->num_channels - 1; i++) { if (channels[i].scan_index < 0) continue; for (j = i + 1; j < indio_dev->num_channels; j++) if (channels[i].scan_index == channels[j].scan_index) { dev_err(&indio_dev->dev, "Duplicate scan index %d\n", channels[i].scan_index); return -EINVAL; } } return 0; } static int iio_check_extended_name(const struct iio_dev *indio_dev) { unsigned int i; if (!indio_dev->info->read_label) return 0; for (i = 0; i < indio_dev->num_channels; i++) { if (indio_dev->channels[i].extend_name) { dev_err(&indio_dev->dev, "Cannot use labels and extend_name at the same time\n"); return -EINVAL; } } return 0; } static const struct iio_buffer_setup_ops noop_ring_setup_ops; int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); struct fwnode_handle *fwnode = NULL; int ret; if (!indio_dev->info) return -EINVAL; iio_dev_opaque->driver_module = this_mod; /* If the calling driver did not initialize firmware node, do it here */ if (dev_fwnode(&indio_dev->dev)) fwnode = dev_fwnode(&indio_dev->dev); /* The default dummy IIO device has no parent */ else if (indio_dev->dev.parent) fwnode = dev_fwnode(indio_dev->dev.parent); device_set_node(&indio_dev->dev, fwnode); fwnode_property_read_string(fwnode, "label", &indio_dev->label); ret = iio_check_unique_scan_index(indio_dev); if (ret < 0) return ret; ret = iio_check_extended_name(indio_dev); if (ret < 0) return ret; iio_device_register_debugfs(indio_dev); ret = iio_buffers_alloc_sysfs_and_mask(indio_dev); if (ret) { dev_err(indio_dev->dev.parent, "Failed to create buffer sysfs interfaces\n"); goto error_unreg_debugfs; } ret = iio_device_register_sysfs(indio_dev); if (ret) { dev_err(indio_dev->dev.parent, "Failed to register sysfs interfaces\n"); goto error_buffer_free_sysfs; } ret = iio_device_register_eventset(indio_dev); if (ret) { dev_err(indio_dev->dev.parent, "Failed to register event set\n"); goto error_free_sysfs; } if (indio_dev->modes & INDIO_ALL_TRIGGERED_MODES) iio_device_register_trigger_consumer(indio_dev); if ((indio_dev->modes & INDIO_ALL_BUFFER_MODES) && indio_dev->setup_ops == NULL) indio_dev->setup_ops = &noop_ring_setup_ops; if (iio_dev_opaque->attached_buffers_cnt) cdev_init(&iio_dev_opaque->chrdev, &iio_buffer_fileops); else if (iio_dev_opaque->event_interface) cdev_init(&iio_dev_opaque->chrdev, &iio_event_fileops); if (iio_dev_opaque->attached_buffers_cnt || iio_dev_opaque->event_interface) { indio_dev->dev.devt = MKDEV(MAJOR(iio_devt), iio_dev_opaque->id); iio_dev_opaque->chrdev.owner = this_mod; } /* assign device groups now; they should be all registered now */ indio_dev->dev.groups = iio_dev_opaque->groups; ret = cdev_device_add(&iio_dev_opaque->chrdev, &indio_dev->dev); if (ret < 0) goto error_unreg_eventset; return 0; error_unreg_eventset: iio_device_unregister_eventset(indio_dev); error_free_sysfs: iio_device_unregister_sysfs(indio_dev); error_buffer_free_sysfs: iio_buffers_free_sysfs_and_mask(indio_dev); error_unreg_debugfs: iio_device_unregister_debugfs(indio_dev); return ret; } EXPORT_SYMBOL(__iio_device_register); /** * iio_device_unregister() - unregister a device from the IIO subsystem * @indio_dev: Device structure representing the device. */ void iio_device_unregister(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); cdev_device_del(&iio_dev_opaque->chrdev, &indio_dev->dev); mutex_lock(&iio_dev_opaque->info_exist_lock); iio_device_unregister_debugfs(indio_dev); iio_disable_all_buffers(indio_dev); indio_dev->info = NULL; iio_device_wakeup_eventset(indio_dev); iio_buffer_wakeup_poll(indio_dev); mutex_unlock(&iio_dev_opaque->info_exist_lock); iio_buffers_free_sysfs_and_mask(indio_dev); } EXPORT_SYMBOL(iio_device_unregister); static void devm_iio_device_unreg(void *indio_dev) { iio_device_unregister(indio_dev); } int __devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev, struct module *this_mod) { int ret; ret = __iio_device_register(indio_dev, this_mod); if (ret) return ret; return devm_add_action_or_reset(dev, devm_iio_device_unreg, indio_dev); } EXPORT_SYMBOL_GPL(__devm_iio_device_register); /** * iio_device_claim_direct_mode - Keep device in direct mode * @indio_dev: the iio_dev associated with the device * * If the device is in direct mode it is guaranteed to stay * that way until iio_device_release_direct_mode() is called. * * Use with iio_device_release_direct_mode() * * Returns: 0 on success, -EBUSY on failure. */ int iio_device_claim_direct_mode(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); mutex_lock(&iio_dev_opaque->mlock); if (iio_buffer_enabled(indio_dev)) { mutex_unlock(&iio_dev_opaque->mlock); return -EBUSY; } return 0; } EXPORT_SYMBOL_GPL(iio_device_claim_direct_mode); /** * iio_device_release_direct_mode - releases claim on direct mode * @indio_dev: the iio_dev associated with the device * * Release the claim. Device is no longer guaranteed to stay * in direct mode. * * Use with iio_device_claim_direct_mode() */ void iio_device_release_direct_mode(struct iio_dev *indio_dev) { mutex_unlock(&to_iio_dev_opaque(indio_dev)->mlock); } EXPORT_SYMBOL_GPL(iio_device_release_direct_mode); /** * iio_device_claim_buffer_mode - Keep device in buffer mode * @indio_dev: the iio_dev associated with the device * * If the device is in buffer mode it is guaranteed to stay * that way until iio_device_release_buffer_mode() is called. * * Use with iio_device_release_buffer_mode(). * * Returns: 0 on success, -EBUSY on failure. */ int iio_device_claim_buffer_mode(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); mutex_lock(&iio_dev_opaque->mlock); if (iio_buffer_enabled(indio_dev)) return 0; mutex_unlock(&iio_dev_opaque->mlock); return -EBUSY; } EXPORT_SYMBOL_GPL(iio_device_claim_buffer_mode); /** * iio_device_release_buffer_mode - releases claim on buffer mode * @indio_dev: the iio_dev associated with the device * * Release the claim. Device is no longer guaranteed to stay * in buffer mode. * * Use with iio_device_claim_buffer_mode(). */ void iio_device_release_buffer_mode(struct iio_dev *indio_dev) { mutex_unlock(&to_iio_dev_opaque(indio_dev)->mlock); } EXPORT_SYMBOL_GPL(iio_device_release_buffer_mode); /** * iio_device_get_current_mode() - helper function providing read-only access to * the opaque @currentmode variable * @indio_dev: IIO device structure for device */ int iio_device_get_current_mode(struct iio_dev *indio_dev) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); return iio_dev_opaque->currentmode; } EXPORT_SYMBOL_GPL(iio_device_get_current_mode); subsys_initcall(iio_init); module_exit(iio_exit); MODULE_AUTHOR("Jonathan Cameron <jic23@kernel.org>"); MODULE_DESCRIPTION("Industrial I/O core"); MODULE_LICENSE("GPL");
14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 196 196 14 14 14 14 14 14 14 14 14 14 14 14 14 14 23 23 23 23 23 23 2 2 2 21 21 21 21 14 14 14 14 14 14 14 14 14 14 14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/super.c * * Copyright (C) 1991, 1992 Linus Torvalds * * super.c contains code to handle: - mount structures * - super-block tables * - filesystem drivers list * - mount system call * - umount system call * - ustat system call * * GK 2/5/95 - Changed to support mounting the root fs via NFS * * Added kerneld support: Jacques Gelinas and Bjorn Ekwall * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 * Added options to /proc/mounts: * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ #include <linux/export.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" static int thaw_super_locked(struct super_block *sb, enum freeze_holder who); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", "sb_pagefaults", "sb_internal", }; static inline void __super_lock(struct super_block *sb, bool excl) { if (excl) down_write(&sb->s_umount); else down_read(&sb->s_umount); } static inline void super_unlock(struct super_block *sb, bool excl) { if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); } static inline void __super_lock_excl(struct super_block *sb) { __super_lock(sb, true); } static inline void super_unlock_excl(struct super_block *sb) { super_unlock(sb, true); } static inline void super_unlock_shared(struct super_block *sb) { super_unlock(sb, false); } static inline bool wait_born(struct super_block *sb) { unsigned int flags; /* * Pairs with smp_store_release() in super_wake() and ensures * that we see SB_BORN or SB_DYING after we're woken. */ flags = smp_load_acquire(&sb->s_flags); return flags & (SB_BORN | SB_DYING); } /** * super_lock - wait for superblock to become ready and lock it * @sb: superblock to wait for * @excl: whether exclusive access is required * * If the superblock has neither passed through vfs_get_tree() or * generic_shutdown_super() yet wait for it to happen. Either superblock * creation will succeed and SB_BORN is set by vfs_get_tree() or we're * woken and we'll see SB_DYING. * * The caller must have acquired a temporary reference on @sb->s_count. * * Return: This returns true if SB_BORN was set, false if SB_DYING was * set. The function acquires s_umount and returns with it held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { lockdep_assert_not_held(&sb->s_umount); relock: __super_lock(sb, excl); /* * Has gone through generic_shutdown_super() in the meantime. * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. */ if (sb->s_flags & SB_DYING) return false; /* Has called ->get_tree() successfully. */ if (sb->s_flags & SB_BORN) return true; super_unlock(sb, excl); /* wait until the superblock is ready or dying */ wait_var_event(&sb->s_flags, wait_born(sb)); /* * Neither SB_BORN nor SB_DYING are ever unset so we never loop. * Just reacquire @sb->s_umount for the caller. */ goto relock; } /* wait and acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } /* wait and acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); } /* wake waiters */ #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) static void super_wake(struct super_block *sb, unsigned int flag) { WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); /* * Pairs with smp_load_acquire() in super_lock() to make sure * all initializations in the superblock are seen by the user * seeing SB_BORN sent. */ smp_store_release(&sb->s_flags, sb->s_flags | flag); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees SB_BORN set or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); wake_up_var(&sb->s_flags); } /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long fs_objects = 0; long total_objects; long freed = 0; long dentries; long inodes; sb = shrink->private_data; /* * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches * * Ensure that we always scan at least one object - memcg kmem * accounting uses this to fully empty the caches. */ sc->nr_to_scan = dentries + 1; freed = prune_dcache_sb(sb, sc); sc->nr_to_scan = inodes + 1; freed += prune_icache_sb(sb, sc); if (fs_objects) { sc->nr_to_scan = fs_objects + 1; freed += sb->s_op->free_cached_objects(sb, sc); } super_unlock_shared(sb); return freed; } static unsigned long super_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long total_objects = 0; sb = shrink->private_data; /* * We don't call super_trylock_shared() here as it is a scalability * bottleneck, so we're exposed to partial setup state. The shrinker * rwsem does not protect filesystem operations backing * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can * change between super_cache_count and super_cache_scan, so we really * don't need locks here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it * is dangerous to access it. super_trylock_shared() uses a SB_BORN check * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) return 0; smp_rmb(); if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); if (!total_objects) return SHRINK_EMPTY; total_objects = vfs_pressure_ratio(total_objects); return total_objects; } static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); int i; for (i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } static void destroy_super_rcu(struct rcu_head *head) { struct super_block *s = container_of(head, struct super_block, rcu); INIT_WORK(&s->destroy_work, destroy_super_work); schedule_work(&s->destroy_work); } /* Free a superblock that has never been seen by anyone */ static void destroy_unused_super(struct super_block *s) { if (!s) return; super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; int i; if (!s) return NULL; INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * sget() can have s_umount recursion. * * When it cannot find a suitable sb, it allocates a new * one (this one), and tries again to find a suitable old * one. * * In case that succeeds, it will acquire the s_umount * lock of the old one. Since these are clearly distrinct * locks, and this object isn't exposed yet, there's no * risk of deadlocks. * * Annotate this by putting this lock in a different * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], sb_writers_name[i], &type->s_writers_key[i])) goto fail; } s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "sb-%s", type->name); if (!s->s_shrink) goto fail; s->s_shrink->scan_objects = super_cache_scan; s->s_shrink->count_objects = super_cache_count; s->s_shrink->batch = 1024; s->s_shrink->private_data = s; if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; return s; fail: destroy_unused_super(s); return NULL; } /* Superblock refcounting */ /* * Drop a superblock's refcount. The caller must hold sb_lock. */ static void __put_super(struct super_block *s) { if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); } } /** * put_super - drop a temporary reference to superblock * @sb: superblock in question * * Drops a temporary reference, frees superblock if there's no * references left. */ void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); spin_unlock(&sb_lock); } static void kill_super_notify(struct super_block *sb) { lockdep_assert_not_held(&sb->s_umount); /* already notified earlier */ if (sb->s_flags & SB_DEAD) return; /* * Remove it from @fs_supers so it isn't found by new * sget{_fc}() walkers anymore. Any concurrent mounter still * managing to grab a temporary reference is guaranteed to * already see SB_DYING and will wait until we notify them about * SB_DEAD. */ spin_lock(&sb_lock); hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); /* * Let concurrent mounts know that this thing is really dead. * We don't need @sb->s_umount here as every concurrent caller * will see SB_DYING and either discard the superblock or wait * for SB_DEAD. */ super_wake(sb, SB_DEAD); } /** * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * * Drops an active reference to superblock, converting it into a temporary * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. * * Caller holds exclusive lock on superblock; that lock is released. */ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { shrinker_free(s->s_shrink); fs->kill_sb(s); kill_super_notify(s); /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy * the lru lists right now. */ list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); put_filesystem(fs); put_super(s); } else { super_unlock_excl(s); } } EXPORT_SYMBOL(deactivate_locked_super); /** * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * * Variant of deactivate_locked_super(), except that superblock is *not* * locked by caller. If we are going to drop the final active reference, * lock will be acquired prior to that. */ void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { __super_lock_excl(s); deactivate_locked_super(s); } } EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference * @s: reference we are trying to make active * * Tries to acquire an active reference. grab_super() is used when we * had just found a superblock in super_blocks or fs_type->fs_supers * and want to turn it into a full-blown active reference. grab_super() * is called with sb_lock held and drops it. Returns 1 in case of * success, 0 if we had failed (superblock contents was already dead or * dying when grab_super() had been called). Note that this is only * called for superblocks not in rundown mode (== ones still on ->fs_supers * of their type), so increment of ->s_count is OK here. */ static int grab_super(struct super_block *s) __releases(sb_lock) { bool born; s->s_count++; spin_unlock(&sb_lock); born = super_lock_excl(s); if (born && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1; } super_unlock_excl(s); put_super(s); return 0; } static inline bool wait_dead(struct super_block *sb) { unsigned int flags; /* * Pairs with memory barrier in super_wake() and ensures * that we see SB_DEAD after we're woken. */ flags = smp_load_acquire(&sb->s_flags); return flags & SB_DEAD; } /** * grab_super_dead - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for * an active reference. This is used in sget{_fc}() to wait for a * superblock to either become SB_BORN or for it to pass through * sb->kill() and be marked as SB_DEAD. * * Return: This returns true if an active reference could be acquired, * false if not. */ static bool grab_super_dead(struct super_block *sb) { sb->s_count++; if (grab_super(sb)) { put_super(sb); lockdep_assert_held(&sb->s_umount); return true; } wait_var_event(&sb->s_flags, wait_dead(sb)); lockdep_assert_not_held(&sb->s_umount); put_super(sb); return false; } /* * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the * filesystem is not shut down while we are working on it. It returns * false if we cannot acquire s_umount or if we lose the race and * filesystem already got into shutdown, and returns true with the s_umount * lock held in read mode in case of success. On successful return, * the caller must drop the s_umount lock when done. * * Note that unlike get_super() et.al. this one does *not* bump ->s_count. * The reason why it's safe is that we are OK with doing trylock instead * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!(sb->s_flags & SB_DYING) && sb->s_root && (sb->s_flags & SB_BORN)) return true; super_unlock_shared(sb); } return false; } /** * retire_super - prevents superblock from being reused * @sb: superblock to retire * * The function marks superblock to be ignored in superblock test, which * prevents it from being reused for any new mounts. If the superblock has * a private bdi, it also unregisters it, but doesn't reduce the refcount * of the superblock to prevent potential races. The refcount is reduced * by generic_shutdown_super(). The function can not be called * concurrently with generic_shutdown_super(). It is safe to call the * function multiple times, subsequent calls have no effect. * * The marker will affect the re-use only for block-device-based * superblocks. Other superblocks will still get marked if this function * is used, but that will not affect their reusability. */ void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * * generic_shutdown_super() does all fs-independent work on superblock * shutdown. Typical ->kill_sb() should pick all fs-specific objects * that need destruction out of superblock, call generic_shutdown_super() * and release aforementioned objects. Note: dentries and inodes _are_ * taken care of and do not need specific handling. * * Upon calling this function, the filesystem may no longer alter or * rearrange the set of dentries belonging to this super_block, nor may it * change the attachments of dentries to inodes. */ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; cgroup_writeback_umount(); /* Evict all inodes with zero refcount. */ evict_inodes(sb); /* * Clean up and evict any inodes that still have references due * to fsnotify or the security policy. */ fsnotify_sb_delete(sb); security_sb_delete(sb); /* * Now that all potentially-encrypted inodes have been evicted, * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; } if (sop->put_super) sop->put_super(sb); if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* * Adding a proper bailout path here would be hard, but * we can at least make it more likely that a later * iput_final() or such crashes cleanly. */ struct inode *inode; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { inode->i_op = VFS_PTR_POISON; inode->i_sb = VFS_PTR_POISON; inode->i_mapping = VFS_PTR_POISON; } spin_unlock(&sb->s_inode_list_lock); } } /* * Broadcast to everyone that grabbed a temporary reference to this * superblock before we removed it from @fs_supers that the superblock * is dying. Every walker of @fs_supers outside of sget{_fc}() will now * discard this superblock and treat it as dead. * * We leave the superblock on @fs_supers so it can be found by * sget{_fc}() until we passed sb->kill_sb(). */ super_wake(sb, SB_DYING); super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } } EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /** * sget_fc - Find or create a superblock * @fc: Filesystem context. * @test: Comparison callback * @set: Setup callback * * Create a new superblock or find an existing one. * * The @test callback is used to find a matching existing superblock. * Whether or not the requested parameters in @fc are taken into account * is specific to the @test callback that is used. They may even be * completely ignored. * * If an extant superblock is matched, it will be returned unless: * * (1) the namespace the filesystem context @fc and the extant * superblock's namespace differ * * (2) the filesystem context @fc has requested that reusing an extant * superblock is not allowed * * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info and s_id will be * set and the @set callback will be invoked), the superblock will be * published and it will be returned in a partially constructed state * with SB_BORN and SB_ACTIVE as yet unset. * * Return: On success, an extant or newly created superblock is * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)) { struct super_block *s = NULL; struct super_block *old; struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } s->s_fs_info = fc->s_fs_info; err = set(s, fc); if (err) { s->s_fs_info = NULL; spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); /* * Make the superblock visible on @super_blocks and @fs_supers. * It's in a nascent state and users should wait on SB_BORN or * SB_DYING to be set. */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); shrinker_register(s->s_shrink); return s; share_extant_sb: if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); if (fc->exclusive) warnfc(fc, "reusing existing filesystem not allowed"); else warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; } EXPORT_SYMBOL(sget_fc); /** * sget - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data) { struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; /* We don't yet pass the user namespace of the parent * mount through to here so always use &init_user_ns * until that changes. */ if (flags & SB_SUBMOUNT) user_ns = &init_user_ns; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } err = set(s, data); if (err) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { super_unlock_shared(sb); put_super(sb); } EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); static void __iterate_supers(void (*f)(struct super_block *)) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { /* Pairs with memory marrier in super_wake(). */ if (smp_load_acquire(&sb->s_flags) & SB_DYING) continue; sb->s_count++; spin_unlock(&sb_lock); f(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers - call function for all active superblocks * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { bool born; sb->s_count++; spin_unlock(&sb_lock); born = super_lock_shared(sb); if (born && sb->s_root) f(sb, arg); super_unlock_shared(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers_type - call function for superblocks of given type * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { bool born; sb->s_count++; spin_unlock(&sb_lock); born = super_lock_shared(sb); if (born && sb->s_root) f(sb, arg); super_unlock_shared(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } EXPORT_SYMBOL(iterate_supers_type); /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * * Scans the superblock list and finds the superblock of the file system * mounted on the device given. Returns the superblock with an active * reference or %NULL if none was found. */ struct super_block *get_active_super(struct block_device *bdev) { struct super_block *sb; if (!bdev) return NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_bdev == bdev) { if (!grab_super(sb)) return NULL; super_unlock_excl(sb); return sb; } } spin_unlock(&sb_lock); return NULL; } struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dev == dev) { bool born; sb->s_count++; spin_unlock(&sb_lock); /* still alive? */ born = super_lock(sb, excl); if (born && sb->s_root) return sb; super_unlock(sb, excl); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); break; } } spin_unlock(&sb_lock); return NULL; } /** * reconfigure_super - asks filesystem to change superblock parameters * @fc: The superblock and configuration * * Alters the configuration parameters of a live superblock. */ int reconfigure_super(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; retval = security_sb_remount(sb, fc->security); if (retval) return retval; if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev)) return -EACCES; #endif remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { super_unlock_excl(sb); group_pin_kill(&sb->s_pins); __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); /* If we are reconfiguring to RDONLY and current sb is read/write, * make sure there are no files open for writing. */ if (remount_ro) { if (force) { sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } } else if (remount_rw) { /* * Protect filesystem's reconfigure code from writes from * userspace until reconfigure finishes. */ sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in * pagecache, etc). Also file data modifications go via their own * mappings. So If we try to mount readonly then copy the filesystem * from bdev, we could get stale data, so invalidate it to give a best * effort at coherency. */ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; cancel_readonly: sb_end_ro_state_change(sb); return retval; } static void do_emergency_remount_callback(struct super_block *sb) { bool born = super_lock_excl(sb); if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY | SB_FORCE, SB_RDONLY); if (!IS_ERR(fc)) { if (parse_monolithic_mount_data(fc, NULL) == 0) (void)reconfigure_super(fc); put_fs_context(fc); } } super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) { __iterate_supers(do_emergency_remount_callback); kfree(work); printk("Emergency Remount complete\n"); } void emergency_remount(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_emergency_remount); schedule_work(work); } } static void do_thaw_all_callback(struct super_block *sb) { bool born = super_lock_excl(sb); if (born && sb->s_root) { if (IS_ENABLED(CONFIG_BLOCK)) while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); } else { super_unlock_excl(sb); } } static void do_thaw_all(struct work_struct *work) { __iterate_supers(do_thaw_all_callback); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } /** * emergency_thaw_all -- forcibly thaw every frozen filesystem * * Used for emergency unfreeze of all filesystems via SysRq */ void emergency_thaw_all(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_thaw_all); schedule_work(work); } } static DEFINE_IDA(unnamed_dev_ida); /** * get_anon_bdev - Allocate a block device for filesystems which don't have one. * @p: Pointer to a dev_t. * * Filesystems which don't use real block devices can call this function * to allocate a virtual block device. * * Context: Any context. Frequently called while holding sb_lock. * Return: 0 on success, -EMFILE if there are no anonymous bdevs left * or -ENOMEM if memory allocation failed. */ int get_anon_bdev(dev_t *p) { int dev; /* * Many userspace utilities consider an FSID of 0 invalid. * Always return at least 1 from get_anon_bdev. */ dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1, GFP_ATOMIC); if (dev == -ENOSPC) dev = -EMFILE; if (dev < 0) return dev; *p = MKDEV(0, dev); return 0; } EXPORT_SYMBOL(get_anon_bdev); void free_anon_bdev(dev_t dev) { ida_free(&unnamed_dev_ida, MINOR(dev)); } EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); void kill_litter_super(struct super_block *sb) { if (sb->s_root) d_genocide(sb->s_root); kill_anon_super(sb); } EXPORT_SYMBOL(kill_litter_super); int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); } EXPORT_SYMBOL(set_anon_super_fc); static int test_keyed_super(struct super_block *sb, struct fs_context *fc) { return sb->s_fs_info == fc->s_fs_info; } static int test_single_super(struct super_block *s, struct fs_context *fc) { return 1; } static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { struct super_block *sb; int err; sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = fill_super(sb, fc); if (err) goto error; sb->s_flags |= SB_ACTIVE; } fc->root = dget(sb->s_root); return 0; error: deactivate_locked_super(sb); return err; } int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); static int set_bdev_super(struct super_block *s, void *data) { s->s_dev = *(dev_t *)data; return 0; } static int super_s_dev_set(struct super_block *s, struct fs_context *fc) { return set_bdev_super(s, fc->sget_key); } static int super_s_dev_test(struct super_block *s, struct fs_context *fc) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)fc->sget_key; } /** * sget_dev - Find or create a superblock by device number * @fc: Filesystem context. * @dev: device number * * Find or create a superblock using the provided device number that * will be stored in fc->sget_key. * * If an extant superblock is matched, then that will be returned with * an elevated reference count that the caller must transfer or discard. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will * be set). The superblock will be published and it will be returned in * a partially constructed state with SB_BORN and SB_ACTIVE as yet * unset. * * Return: an existing or newly created superblock on success, an error * pointer on failure. */ struct super_block *sget_dev(struct fs_context *fc, dev_t dev) { fc->sget_key = &dev; return sget_fc(fc, super_s_dev_test, super_s_dev_set); } EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* * Lock the superblock that is holder of the bdev. Returns the superblock * pointer if we successfully locked the superblock and it is alive. Otherwise * we return NULL and just unlock bdev->bd_holder_lock. * * The function must be called with bdev->bd_holder_lock and releases it. */ static struct super_block *bdev_super_lock_shared(struct block_device *bdev) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; bool born; lockdep_assert_held(&bdev->bd_holder_lock); lockdep_assert_not_held(&sb->s_umount); lockdep_assert_not_held(&bdev->bd_disk->open_mutex); /* Make sure sb doesn't go away from under us */ spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_holder_lock); born = super_lock_shared(sb); if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock_shared(sb); put_super(sb); return NULL; } /* * The superblock is active and we hold s_umount, we can drop our * temporary reference now. */ put_super(sb); return sb; } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; sb = bdev_super_lock_shared(bdev); if (!sb) return; if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); invalidate_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); super_unlock_shared(sb); } static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb; sb = bdev_super_lock_shared(bdev); if (!sb) return; sync_filesystem(sb); super_unlock_shared(sb); } const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, }; EXPORT_SYMBOL_GPL(fs_holder_ops); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); struct bdev_handle *bdev_handle; struct block_device *bdev; bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); if (IS_ERR(bdev_handle)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev_handle); } bdev = bdev_handle->bdev; /* * This really should be in blkdev_get_by_dev, but right now can't due * to legacy issues that require us to allow opening a block device node * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { bdev_release(bdev_handle); return -EACCES; } /* * Until SB_BORN flag is set, there can be no active superblock * references and thus no filesystem freezing. get_active_super() will * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed. * * It is enough to check bdev was not frozen before we set s_bdev. */ mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); bdev_release(bdev_handle); return -EBUSY; } spin_lock(&sb_lock); sb->s_bdev_handle = bdev_handle; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_fsfreeze_mutex); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; } EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock */ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { struct super_block *s; int error = 0; dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); error = lookup_bdev(fc->source, &dev); if (error) { errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); return -EBUSY; } } else { /* * We drop s_umount here because we need to open the bdev and * bdev->open_mutex ranks above s_umount (blkdev_put() -> * bdev_mark_dead()). It is safe because we have active sb * reference and SB_BORN is not set yet. */ super_unlock_excl(s); error = setup_bdev_super(s, fc->sb_flags, fc); __super_lock_excl(s); if (!error) error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } s->s_flags |= SB_ACTIVE; } BUG_ON(fc->root); fc->root = dget(s->s_root); return 0; } EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; dev_t dev; error = lookup_bdev(dev_name, &dev); if (error) return ERR_PTR(error); flags |= SB_NOSEC; s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } } else { /* * We drop s_umount here because we need to open the bdev and * bdev->open_mutex ranks above s_umount (blkdev_put() -> * bdev_mark_dead()). It is safe because we have active sb * reference and SB_BORN is not set yet. */ super_unlock_excl(s); error = setup_bdev_super(s, flags, NULL); __super_lock_excl(s); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; } return dget(s->s_root); } EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); bdev_release(sb->s_bdev_handle); } } EXPORT_SYMBOL(kill_block_super); #endif struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); int reconfigure_single(struct super_block *s, int flags, void *data) { struct fs_context *fc; int ret; /* The caller really need to be passing fc down into mount_single(), * then a chunk of this can be removed. [Bollocks -- AV] * Better yet, reconfiguration shouldn't happen, but rather the second * mount should be rejected if the parameters are not compatible. */ fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); ret = parse_monolithic_mount_data(fc, data); if (ret < 0) goto out; ret = reconfigure_super(fc); out: put_fs_context(fc); return ret; } static int compare_single(struct super_block *s, void *p) { return 1; } struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; s = sget(fs_type, compare_single, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (!error) s->s_flags |= SB_ACTIVE; } else { error = reconfigure_single(s, flags, data); } if (unlikely(error)) { deactivate_locked_super(s); return ERR_PTR(error); } return dget(s->s_root); } EXPORT_SYMBOL(mount_single); /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. * * The filesystem is invoked to get or create a superblock which can then later * be used for mounting. The filesystem places a pointer to the root to be * used for mounting in @fc->root. */ int vfs_get_tree(struct fs_context *fc) { struct super_block *sb; int error; if (fc->root) return -EBUSY; /* Get the mountable root in fc->root, with a ref on the root and a ref * on the superblock. */ error = fc->ops->get_tree(fc); if (error < 0) return error; if (!fc->root) { pr_err("Filesystem %s get_tree() didn't set fc->root\n", fc->fs_type->name); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ BUG(); } sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); /* * super_wake() contains a memory barrier which also care of * ordering for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is * the superblock structure contents that we just set up, not * the SB_BORN flag. */ super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { fc_drop_locked(fc); return error; } /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); return 0; } EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct backing_dev_info *bdi; int err; va_list args; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); return err; } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; sb->s_iflags |= SB_I_PERSB_BDI; return 0; } EXPORT_SYMBOL(super_setup_bdi_name); /* * Setup private BDI for given superblock. I gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi(struct super_block *sb) { static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, atomic_long_inc_return(&bdi_seq)); } EXPORT_SYMBOL(super_setup_bdi); /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file * system. */ static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); } /* * We are going to return to userspace and forget about these locks, the * ownership goes to the caller of thaw_super() which does unlock(). */ static void lockdep_sb_freeze_release(struct super_block *sb) { int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } /* * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). */ static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } static void sb_freeze_unlock(struct super_block *sb, int level) { for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } static int wait_for_partially_frozen(struct super_block *sb) { int ret = 0; do { unsigned short old = sb->s_writers.frozen; up_write(&sb->s_umount); ret = wait_var_event_killable(&sb->s_writers.frozen, sb->s_writers.frozen != old); down_write(&sb->s_umount); } while (ret == 0 && sb->s_writers.frozen != SB_UNFROZEN && sb->s_writers.frozen != SB_FREEZE_COMPLETE); return ret; } /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * @who: context that wants to freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or * multiple userspace freezes in effect at any given time, the kernel and * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. * * SB_FREEZE_WRITE: The file system is in the process of being frozen. New * writes should be blocked, though page faults are still allowed. We wait for * all writes to complete and then proceed to the next stage. * * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked * but internal fs threads can still modify the filesystem (although they * should not dirty new pages or inodes), writeback can run etc. After waiting * for all running page faults we sync the filesystem which will clean all * dirty pages and inodes (no new dirty pages or inodes can be created when * sync is running). * * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs * modification are blocked (e.g. XFS preallocation truncation on inode * reclaim). This is usually implemented by blocking new transactions for * filesystems that have them and need this additional guard. After all * internal writers are finished we call ->freeze_fs() to finish filesystem * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. */ int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; atomic_inc(&sb->s_active); if (!super_lock_excl(sb)) WARN(1, "Dying superblock while freezing!"); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (sb->s_writers.freeze_holders & who) { deactivate_locked_super(sb); return -EBUSY; } WARN_ON(sb->s_writers.freeze_holders == 0); /* * Someone else already holds this type of freeze; share the * freeze and assign the active ref to the freeze. */ sb->s_writers.freeze_holders |= who; super_unlock_excl(sb); return 0; } if (sb->s_writers.frozen != SB_UNFROZEN) { ret = wait_for_partially_frozen(sb); if (ret) { deactivate_locked_super(sb); return ret; } goto retry; } if (!(sb->s_flags & SB_BORN)) { super_unlock_excl(sb); return 0; /* sic - it's "nothing to do" */ } if (sb_rdonly(sb)) { /* Nothing to do really... */ sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); if (!super_lock_excl(sb)) WARN(1, "Dying superblock while freezing!"); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ ret = sync_filesystem(sb); if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } } /* * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); /* * Undoes the effect of a freeze_super_locked call. If the filesystem is * frozen both by userspace and the kernel, a thaw call from either source * removes that state without releasing the other state or unlocking the * filesystem. */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { int error; if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (!(sb->s_writers.freeze_holders & who)) { super_unlock_excl(sb); return -EINVAL; } /* * Freeze is shared with someone else. Release our hold and * drop the active ref that freeze_super assigned to the * freezer. */ if (sb->s_writers.freeze_holders & ~who) { sb->s_writers.freeze_holders &= ~who; deactivate_locked_super(sb); return 0; } } else { super_unlock_excl(sb); return -EINVAL; } if (sb_rdonly(sb)) { sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); goto out; } lockdep_sb_freeze_acquire(sb); if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return error; } } sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out: deactivate_locked_super(sb); return 0; } /** * thaw_super -- unlock filesystem * @sb: the super to thaw * @who: context that wants to freeze * * Unlocks the filesystem and marks it writeable again after freeze_super() * if there are no remaining freezes on the filesystem. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. */ int thaw_super(struct super_block *sb, enum freeze_holder who) { if (!super_lock_excl(sb)) WARN(1, "Dying superblock while thawing!"); return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); /* * Create workqueue for deferred direct IO completions. We allocate the * workqueue when it's first needed. This avoids creating workqueue for * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. */ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id); if (!wq) return -ENOMEM; /* * This has to be atomic as more DIOs can race to create the workqueue */ old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); /* Someone created workqueue before us? Free ours... */ if (old) destroy_workqueue(wq); return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
1227 1227 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Politecnico di Torino, Italy * TORSEC group -- https://security.polito.it * * Author: Roberto Sassu <roberto.sassu@polito.it> * * File: ima_template.c * Helpers to manage template descriptors. */ #include <linux/rculist.h> #include "ima.h" #include "ima_template_lib.h" enum header_fields { HDR_PCR, HDR_DIGEST, HDR_TEMPLATE_NAME, HDR_TEMPLATE_DATA, HDR__LAST }; static struct ima_template_desc builtin_templates[] = { {.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT}, {.name = "ima-ng", .fmt = "d-ng|n-ng"}, {.name = "ima-sig", .fmt = "d-ng|n-ng|sig"}, {.name = "ima-ngv2", .fmt = "d-ngv2|n-ng"}, {.name = "ima-sigv2", .fmt = "d-ngv2|n-ng|sig"}, {.name = "ima-buf", .fmt = "d-ng|n-ng|buf"}, {.name = "ima-modsig", .fmt = "d-ng|n-ng|sig|d-modsig|modsig"}, {.name = "evm-sig", .fmt = "d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode"}, {.name = "", .fmt = ""}, /* placeholder for a custom format */ }; static LIST_HEAD(defined_templates); static DEFINE_SPINLOCK(template_list); static int template_setup_done; static const struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, .field_show = ima_show_template_digest}, {.field_id = "n", .field_init = ima_eventname_init, .field_show = ima_show_template_string}, {.field_id = "d-ng", .field_init = ima_eventdigest_ng_init, .field_show = ima_show_template_digest_ng}, {.field_id = "d-ngv2", .field_init = ima_eventdigest_ngv2_init, .field_show = ima_show_template_digest_ngv2}, {.field_id = "n-ng", .field_init = ima_eventname_ng_init, .field_show = ima_show_template_string}, {.field_id = "sig", .field_init = ima_eventsig_init, .field_show = ima_show_template_sig}, {.field_id = "buf", .field_init = ima_eventbuf_init, .field_show = ima_show_template_buf}, {.field_id = "d-modsig", .field_init = ima_eventdigest_modsig_init, .field_show = ima_show_template_digest_ng}, {.field_id = "modsig", .field_init = ima_eventmodsig_init, .field_show = ima_show_template_sig}, {.field_id = "evmsig", .field_init = ima_eventevmsig_init, .field_show = ima_show_template_sig}, {.field_id = "iuid", .field_init = ima_eventinodeuid_init, .field_show = ima_show_template_uint}, {.field_id = "igid", .field_init = ima_eventinodegid_init, .field_show = ima_show_template_uint}, {.field_id = "imode", .field_init = ima_eventinodemode_init, .field_show = ima_show_template_uint}, {.field_id = "xattrnames", .field_init = ima_eventinodexattrnames_init, .field_show = ima_show_template_string}, {.field_id = "xattrlengths", .field_init = ima_eventinodexattrlengths_init, .field_show = ima_show_template_sig}, {.field_id = "xattrvalues", .field_init = ima_eventinodexattrvalues_init, .field_show = ima_show_template_sig}, }; /* * Used when restoring measurements carried over from a kexec. 'd' and 'n' don't * need to be accounted for since they shouldn't be defined in the same template * description as 'd-ng' and 'n-ng' respectively. */ #define MAX_TEMPLATE_NAME_LEN \ sizeof("d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode") static struct ima_template_desc *ima_template; static struct ima_template_desc *ima_buf_template; /** * ima_template_has_modsig - Check whether template has modsig-related fields. * @ima_template: IMA template to check. * * Tells whether the given template has fields referencing a file's appended * signature. */ bool ima_template_has_modsig(const struct ima_template_desc *ima_template) { int i; for (i = 0; i < ima_template->num_fields; i++) if (!strcmp(ima_template->fields[i]->field_id, "modsig") || !strcmp(ima_template->fields[i]->field_id, "d-modsig")) return true; return false; } static int __init ima_template_setup(char *str) { struct ima_template_desc *template_desc; int template_len = strlen(str); if (template_setup_done) return 1; if (!ima_template) ima_init_template_list(); /* * Verify that a template with the supplied name exists. * If not, use CONFIG_IMA_DEFAULT_TEMPLATE. */ template_desc = lookup_template_desc(str); if (!template_desc) { pr_err("template %s not found, using %s\n", str, CONFIG_IMA_DEFAULT_TEMPLATE); return 1; } /* * Verify whether the current hash algorithm is supported * by the 'ima' template. */ if (template_len == 3 && strcmp(str, IMA_TEMPLATE_IMA_NAME) == 0 && ima_hash_algo != HASH_ALGO_SHA1 && ima_hash_algo != HASH_ALGO_MD5) { pr_err("template does not support hash alg\n"); return 1; } ima_template = template_desc; template_setup_done = 1; return 1; } __setup("ima_template=", ima_template_setup); static int __init ima_template_fmt_setup(char *str) { int num_templates = ARRAY_SIZE(builtin_templates); if (template_setup_done) return 1; if (template_desc_init_fields(str, NULL, NULL) < 0) { pr_err("format string '%s' not valid, using template %s\n", str, CONFIG_IMA_DEFAULT_TEMPLATE); return 1; } builtin_templates[num_templates - 1].fmt = str; ima_template = builtin_templates + num_templates - 1; template_setup_done = 1; return 1; } __setup("ima_template_fmt=", ima_template_fmt_setup); struct ima_template_desc *lookup_template_desc(const char *name) { struct ima_template_desc *template_desc; int found = 0; rcu_read_lock(); list_for_each_entry_rcu(template_desc, &defined_templates, list) { if ((strcmp(template_desc->name, name) == 0) || (strcmp(template_desc->fmt, name) == 0)) { found = 1; break; } } rcu_read_unlock(); return found ? template_desc : NULL; } static const struct ima_template_field * lookup_template_field(const char *field_id) { int i; for (i = 0; i < ARRAY_SIZE(supported_fields); i++) if (strncmp(supported_fields[i].field_id, field_id, IMA_TEMPLATE_FIELD_ID_MAX_LEN) == 0) return &supported_fields[i]; return NULL; } static int template_fmt_size(const char *template_fmt) { char c; int template_fmt_len = strlen(template_fmt); int i = 0, j = 0; while (i < template_fmt_len) { c = template_fmt[i]; if (c == '|') j++; i++; } return j + 1; } int template_desc_init_fields(const char *template_fmt, const struct ima_template_field ***fields, int *num_fields) { const char *template_fmt_ptr; const struct ima_template_field *found_fields[IMA_TEMPLATE_NUM_FIELDS_MAX]; int template_num_fields; int i, len; if (num_fields && *num_fields > 0) /* already initialized? */ return 0; template_num_fields = template_fmt_size(template_fmt); if (template_num_fields > IMA_TEMPLATE_NUM_FIELDS_MAX) { pr_err("format string '%s' contains too many fields\n", template_fmt); return -EINVAL; } for (i = 0, template_fmt_ptr = template_fmt; i < template_num_fields; i++, template_fmt_ptr += len + 1) { char tmp_field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN + 1]; len = strchrnul(template_fmt_ptr, '|') - template_fmt_ptr; if (len == 0 || len > IMA_TEMPLATE_FIELD_ID_MAX_LEN) { pr_err("Invalid field with length %d\n", len); return -EINVAL; } memcpy(tmp_field_id, template_fmt_ptr, len); tmp_field_id[len] = '\0'; found_fields[i] = lookup_template_field(tmp_field_id); if (!found_fields[i]) { pr_err("field '%s' not found\n", tmp_field_id); return -ENOENT; } } if (fields && num_fields) { *fields = kmalloc_array(i, sizeof(**fields), GFP_KERNEL); if (*fields == NULL) return -ENOMEM; memcpy(*fields, found_fields, i * sizeof(**fields)); *num_fields = i; } return 0; } void ima_init_template_list(void) { int i; if (!list_empty(&defined_templates)) return; spin_lock(&template_list); for (i = 0; i < ARRAY_SIZE(builtin_templates); i++) { list_add_tail_rcu(&builtin_templates[i].list, &defined_templates); } spin_unlock(&template_list); } struct ima_template_desc *ima_template_desc_current(void) { if (!ima_template) { ima_init_template_list(); ima_template = lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE); } return ima_template; } struct ima_template_desc *ima_template_desc_buf(void) { if (!ima_buf_template) { ima_init_template_list(); ima_buf_template = lookup_template_desc("ima-buf"); } return ima_buf_template; } int __init ima_init_template(void) { struct ima_template_desc *template = ima_template_desc_current(); int result; result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); if (result < 0) { pr_err("template %s init failed, result: %d\n", (strlen(template->name) ? template->name : template->fmt), result); return result; } template = ima_template_desc_buf(); if (!template) { pr_err("Failed to get ima-buf template\n"); return -EINVAL; } result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); if (result < 0) pr_err("template %s init failed, result: %d\n", (strlen(template->name) ? template->name : template->fmt), result); return result; } static struct ima_template_desc *restore_template_fmt(char *template_name) { struct ima_template_desc *template_desc = NULL; int ret; ret = template_desc_init_fields(template_name, NULL, NULL); if (ret < 0) { pr_err("attempting to initialize the template \"%s\" failed\n", template_name); goto out; } template_desc = kzalloc(sizeof(*template_desc), GFP_KERNEL); if (!template_desc) goto out; template_desc->name = ""; template_desc->fmt = kstrdup(template_name, GFP_KERNEL); if (!template_desc->fmt) { kfree(template_desc); template_desc = NULL; goto out; } spin_lock(&template_list); list_add_tail_rcu(&template_desc->list, &defined_templates); spin_unlock(&template_list); out: return template_desc; } static int ima_restore_template_data(struct ima_template_desc *template_desc, void *template_data, int template_data_size, struct ima_template_entry **entry) { struct tpm_digest *digests; int ret = 0; int i; *entry = kzalloc(struct_size(*entry, template_data, template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, sizeof(*digests), GFP_NOFS); if (!digests) { kfree(*entry); return -ENOMEM; } (*entry)->digests = digests; ret = ima_parse_buf(template_data, template_data + template_data_size, NULL, template_desc->num_fields, (*entry)->template_data, NULL, NULL, ENFORCE_FIELDS | ENFORCE_BUFEND, "template data"); if (ret < 0) { kfree((*entry)->digests); kfree(*entry); return ret; } (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { struct ima_field_data *field_data = &(*entry)->template_data[i]; u8 *data = field_data->data; (*entry)->template_data[i].data = kzalloc(field_data->len + 1, GFP_KERNEL); if (!(*entry)->template_data[i].data) { ret = -ENOMEM; break; } memcpy((*entry)->template_data[i].data, data, field_data->len); (*entry)->template_data_len += sizeof(field_data->len); (*entry)->template_data_len += field_data->len; } if (ret < 0) { ima_free_template_entry(*entry); *entry = NULL; } return ret; } /* Restore the serialized binary measurement list without extending PCRs. */ int ima_restore_measurement_list(loff_t size, void *buf) { char template_name[MAX_TEMPLATE_NAME_LEN]; unsigned char zero[TPM_DIGEST_SIZE] = { 0 }; struct ima_kexec_hdr *khdr = buf; struct ima_field_data hdr[HDR__LAST] = { [HDR_PCR] = {.len = sizeof(u32)}, [HDR_DIGEST] = {.len = TPM_DIGEST_SIZE}, }; void *bufp = buf + sizeof(*khdr); void *bufendp; struct ima_template_entry *entry; struct ima_template_desc *template_desc; DECLARE_BITMAP(hdr_mask, HDR__LAST); unsigned long count = 0; int ret = 0; if (!buf || size < sizeof(*khdr)) return 0; if (ima_canonical_fmt) { khdr->version = le16_to_cpu((__force __le16)khdr->version); khdr->count = le64_to_cpu((__force __le64)khdr->count); khdr->buffer_size = le64_to_cpu((__force __le64)khdr->buffer_size); } if (khdr->version != 1) { pr_err("attempting to restore a incompatible measurement list"); return -EINVAL; } if (khdr->count > ULONG_MAX - 1) { pr_err("attempting to restore too many measurements"); return -EINVAL; } bitmap_zero(hdr_mask, HDR__LAST); bitmap_set(hdr_mask, HDR_PCR, 1); bitmap_set(hdr_mask, HDR_DIGEST, 1); /* * ima kexec buffer prefix: version, buffer size, count * v1 format: pcr, digest, template-name-len, template-name, * template-data-size, template-data */ bufendp = buf + khdr->buffer_size; while ((bufp < bufendp) && (count++ < khdr->count)) { int enforce_mask = ENFORCE_FIELDS; enforce_mask |= (count == khdr->count) ? ENFORCE_BUFEND : 0; ret = ima_parse_buf(bufp, bufendp, &bufp, HDR__LAST, hdr, NULL, hdr_mask, enforce_mask, "entry header"); if (ret < 0) break; if (hdr[HDR_TEMPLATE_NAME].len >= MAX_TEMPLATE_NAME_LEN) { pr_err("attempting to restore a template name that is too long\n"); ret = -EINVAL; break; } /* template name is not null terminated */ memcpy(template_name, hdr[HDR_TEMPLATE_NAME].data, hdr[HDR_TEMPLATE_NAME].len); template_name[hdr[HDR_TEMPLATE_NAME].len] = 0; if (strcmp(template_name, "ima") == 0) { pr_err("attempting to restore an unsupported template \"%s\" failed\n", template_name); ret = -EINVAL; break; } template_desc = lookup_template_desc(template_name); if (!template_desc) { template_desc = restore_template_fmt(template_name); if (!template_desc) break; } /* * Only the running system's template format is initialized * on boot. As needed, initialize the other template formats. */ ret = template_desc_init_fields(template_desc->fmt, &(template_desc->fields), &(template_desc->num_fields)); if (ret < 0) { pr_err("attempting to restore the template fmt \"%s\" failed\n", template_desc->fmt); ret = -EINVAL; break; } ret = ima_restore_template_data(template_desc, hdr[HDR_TEMPLATE_DATA].data, hdr[HDR_TEMPLATE_DATA].len, &entry); if (ret < 0) break; if (memcmp(hdr[HDR_DIGEST].data, zero, sizeof(zero))) { ret = ima_calc_field_array_hash( &entry->template_data[0], entry); if (ret < 0) { pr_err("cannot calculate template digest\n"); ret = -EINVAL; break; } } entry->pcr = !ima_canonical_fmt ? *(u32 *)(hdr[HDR_PCR].data) : le32_to_cpu(*(__le32 *)(hdr[HDR_PCR].data)); ret = ima_restore_measurement_entry(entry); if (ret < 0) break; } return ret; }
110 110 110 198 198 198 198 155 155 155 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 // SPDX-License-Identifier: GPL-2.0-only /* * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> */ #define pr_fmt(fmt) "TCP: " fmt #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> #include <linux/jhash.h> #include <net/tcp.h> #include <trace/events/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (strcmp(e->name, name) == 0) return e; } return NULL; } void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); trace_tcp_cong_state_set(sk, ca_state); if (icsk->icsk_ca_ops->set_state) icsk->icsk_ca_ops->set_state(sk, ca_state); icsk->icsk_ca_state = ca_state; } /* Must be called with rcu lock held */ static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, const char *name) { struct tcp_congestion_ops *ca = tcp_ca_find(name); #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); ca = tcp_ca_find(name); } #endif return ca; } /* Simple linear search, not much in here. */ struct tcp_congestion_ops *tcp_ca_find_key(u32 key) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (e->key == key) return e; } return NULL; } int tcp_validate_congestion_control(struct tcp_congestion_ops *ca) { /* all algorithms must implement these */ if (!ca->ssthresh || !ca->undo_cwnd || !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } return 0; } /* Attach new congestion control algorithm to the list * of available options. */ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret; ret = tcp_validate_congestion_control(ca); if (ret) return ret; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) { pr_notice("%s already registered or non-unique key\n", ca->name); ret = -EEXIST; } else { list_add_tail_rcu(&ca->list, &tcp_cong_list); pr_debug("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_congestion_control); /* * Remove congestion control algorithm, called from * the module's remove function. Module ref counts are used * to ensure that this can't be done till all sockets using * that method are closed. */ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) { spin_lock(&tcp_cong_list_lock); list_del_rcu(&ca->list); spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module gets removed entirely. * * A try_module_get() should fail by now as our module is * in "going" state since no refs are held anymore and * module_exit() handler being called. */ synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Replace a registered old ca with a new one. * * The new ca must have the same name as the old one, that has been * registered. */ int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca) { struct tcp_congestion_ops *existing; int ret; ret = tcp_validate_congestion_control(ca); if (ret) return ret; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); existing = tcp_ca_find_key(old_ca->key); if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) { pr_notice("%s not registered or non-unique key\n", ca->name); ret = -EINVAL; } else if (existing != old_ca) { pr_notice("invalid old congestion control algorithm to replace\n"); ret = -EINVAL; } else { /* Add the new one before removing the old one to keep * one implementation available all the time. */ list_add_tail_rcu(&ca->list, &tcp_cong_list); list_del_rcu(&existing->list); pr_debug("%s updated\n", ca->name); } spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module or struct_ops gets removed entirely. */ if (!ret) synchronize_rcu(); return ret; } u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; } rcu_read_unlock(); return key; } char *tcp_ca_get_name_by_key(u32 key, char *buffer) { const struct tcp_congestion_ops *ca; char *ret = NULL; rcu_read_lock(); ca = tcp_ca_find_key(key); if (ca) ret = strncpy(buffer, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); return ret; } /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ void tcp_cleanup_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; const struct tcp_congestion_ops *prev; int ret; rcu_read_lock(); ca = tcp_ca_find_autoload(net, name); if (!ca) { ret = -ENOENT; } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else if (!net_eq(net, &init_net) && !(ca->flags & TCP_CONG_NON_RESTRICTED)) { /* Only init netns can set default to a restricted algorithm */ ret = -EPERM; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } rcu_read_unlock(); return ret; } /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { return tcp_set_default_congestion_control(&init_net, CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); /* Build string with list of available congestion control values */ void tcp_get_available_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Get current default congestion control */ void tcp_get_default_congestion_control(struct net *net, char *name) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } /* Built list of non-restricted congestion control values */ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Change list of non-restricted congestion control */ int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; char *saved_clone, *clone, *name; int ret = 0; saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; spin_lock(&tcp_cong_list_lock); /* pass 1 check for bad entries */ while ((name = strsep(&clone, " ")) && *name) { ca = tcp_ca_find(name); if (!ca) { ret = -ENOENT; goto out; } } /* pass 2 clear old values */ list_for_each_entry_rcu(ca, &tcp_cong_list, list) ca->flags &= ~TCP_CONG_NON_RESTRICTED; /* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); WARN_ON(!ca); if (ca) ca->flags |= TCP_CONG_NON_RESTRICTED; } out: spin_unlock(&tcp_cong_list_lock); kfree(saved_clone); return ret; } /* Change congestion control for socket. If load is false, then it is the * responsibility of the caller to call tcp_init_congestion_control or * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; int err = 0; if (icsk->icsk_ca_dst_locked) return -EPERM; rcu_read_lock(); if (!load) ca = tcp_ca_find(name); else ca = tcp_ca_find_autoload(sock_net(sk), name); /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; else tcp_reinit_congestion_control(sk, ca); out: rcu_read_unlock(); return err; } /* Slow start is used when congestion window is no greater than the slow start * threshold. We base on RFC2581 and also handle stretch ACKs properly. * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but * something better;) a packet is only considered (s)acked in its entirety to * defend the ACK attacks described in the RFC. Slow start processes a stretch * ACK of degree N as if N acks of degree 1 are received back to back except * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ __bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); acked -= cwnd - tcp_snd_cwnd(tp); tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), * for every packet that was ACKed. */ __bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { /* If credits accumulated at a higher w, apply them gently now. */ if (tp->snd_cwnd_cnt >= w) { tp->snd_cwnd_cnt = 0; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { u32 delta = tp->snd_cwnd_cnt / w; tp->snd_cwnd_cnt -= delta * w; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta); } tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* * TCP Reno congestion control * This is special case used for fallback as well. */ /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ __bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } /* In dangerous area, increase slowly. */ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ __bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp) >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); __bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp), tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, };
3 3 4 7 7 7 7 7 4 4 5 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 // SPDX-License-Identifier: GPL-2.0-only /* tunnel4.c: Generic IP tunnel transformer. * * Copyright (C) 2003 David S. Miller (davem@redhat.com) */ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/mpls.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/icmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { return (family == AF_INET) ? &tunnel4_handlers : (family == AF_INET6) ? &tunnel64_handlers : &tunnelmpls4_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&tunnel4_mutex); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_register); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } mutex_unlock(&tunnel4_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err) { struct xfrm_tunnel __rcu *head; struct xfrm_tunnel *handler; int ret; head = (proto == IPPROTO_IPIP) ? tunnel4_handlers : tunnel64_handlers; for_each_tunnel_rcu(head, handler) { if (handler->cb_handler) { ret = handler->cb_handler(skb, err); if (ret <= 0) return ret; } } return 0; } static const struct xfrm_input_afinfo tunnel4_input_afinfo = { .family = AF_INET, .is_ipip = true, .callback = tunnel4_rcv_cb, }; #endif #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct mpls_label))) goto drop; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif static int tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, }; #if IS_ENABLED(CONFIG_IPV6) static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, }; #endif #if IS_ENABLED(CONFIG_MPLS) static const struct net_protocol tunnelmpls4_protocol = { .handler = tunnelmpls4_rcv, .err_handler = tunnelmpls4_err, .no_policy = 1, }; #endif static int __init tunnel4_init(void) { if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) goto err; #if IS_ENABLED(CONFIG_IPV6) if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); goto err; } #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif goto err; } #endif #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif #if IS_ENABLED(CONFIG_MPLS) inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS); #endif goto err; } #endif return 0; err: pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } static void __exit tunnel4_fini(void) { #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo)) pr_err("tunnel4 close: can't remove input afinfo\n"); #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) pr_err("tunnelmpls4 close: can't remove protocol\n"); #endif #if IS_ENABLED(CONFIG_IPV6) if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) pr_err("tunnel64 close: can't remove protocol\n"); #endif if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) pr_err("tunnel4 close: can't remove protocol\n"); } module_init(tunnel4_init); module_exit(tunnel4_fini); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_GSO_H #define _NET_GSO_H #include <linux/skbuff.h> /* Keeps track of mac header offset relative to skb->head. * It is useful for TSO of Tunneling protocol. e.g. GRE. * For non-tunnel skb it points to skb_mac_header() and for * tunnel skb it points to outer mac header. * Keeps track of level of encapsulation of network headers. */ struct skb_gso_cb { union { int mac_offset; int data_offset; }; int encap_level; __wsum csum; __u16 csum_start; }; #define SKB_GSO_CB_OFFSET 32 #define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET)) static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) { return (skb_mac_header(inner_skb) - inner_skb->head) - SKB_GSO_CB(inner_skb)->mac_offset; } static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) { int new_headroom, headroom; int ret; headroom = skb_headroom(skb); ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); if (ret) return ret; new_headroom = skb_headroom(skb); SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); return 0; } static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res) { /* Do not update partial checksums if remote checksum is enabled. */ if (skb->remcsum_offload) return; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head; } /* Compute the checksum for a gso segment. First compute the checksum value * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and * then add in skb->csum (checksum from csum_start to end of packet). * skb->csum and csum_start are then updated to reflect the checksum of the * resultant packet starting from the transport header-- the resultant checksum * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo * header. */ static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) { unsigned char *csum_start = skb_transport_header(skb); int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start; __wsum partial = SKB_GSO_CB(skb)->csum; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = csum_start - skb->head; return csum_fold(csum_partial(csum_start, plen, partial)); } struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { return __skb_gso_segment(skb, features, true); } struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) { skb->protocol = protocol; skb->encapsulation = 1; skb_push(skb, pulled_hlen); skb_reset_transport_header(skb); skb->mac_header = mac_offset; skb->network_header = skb->mac_header + mac_len; skb->mac_len = mac_len; } #endif /* _NET_GSO_H */
2697 2697 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq #if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_H #include <linux/tracepoint.h> struct irqaction; struct softirq_action; #define SOFTIRQ_NAME_LIST \ softirq_name(HI) \ softirq_name(TIMER) \ softirq_name(NET_TX) \ softirq_name(NET_RX) \ softirq_name(BLOCK) \ softirq_name(IRQ_POLL) \ softirq_name(TASKLET) \ softirq_name(SCHED) \ softirq_name(HRTIMER) \ softirq_name_end(RCU) #undef softirq_name #undef softirq_name_end #define softirq_name(sirq) TRACE_DEFINE_ENUM(sirq##_SOFTIRQ); #define softirq_name_end(sirq) TRACE_DEFINE_ENUM(sirq##_SOFTIRQ); SOFTIRQ_NAME_LIST #undef softirq_name #undef softirq_name_end #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }, #define softirq_name_end(sirq) { sirq##_SOFTIRQ, #sirq } #define show_softirq_name(val) \ __print_symbolic(val, SOFTIRQ_NAME_LIST) /** * irq_handler_entry - called immediately before the irq action handler * @irq: irq number * @action: pointer to struct irqaction * * The struct irqaction pointed to by @action contains various * information about the handler, including the device name, * @action->name, and the device id, @action->dev_id. When used in * conjunction with the irq_handler_exit tracepoint, we can figure * out irq handler latencies. */ TRACE_EVENT(irq_handler_entry, TP_PROTO(int irq, struct irqaction *action), TP_ARGS(irq, action), TP_STRUCT__entry( __field( int, irq ) __string( name, action->name ) ), TP_fast_assign( __entry->irq = irq; __assign_str(name, action->name); ), TP_printk("irq=%d name=%s", __entry->irq, __get_str(name)) ); /** * irq_handler_exit - called immediately after the irq action handler returns * @irq: irq number * @action: pointer to struct irqaction * @ret: return value * * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding * @action->handler successfully handled this irq. Otherwise, the irq might be * a shared irq line, or the irq was not handled successfully. Can be used in * conjunction with the irq_handler_entry to understand irq handler latencies. */ TRACE_EVENT(irq_handler_exit, TP_PROTO(int irq, struct irqaction *action, int ret), TP_ARGS(irq, action, ret), TP_STRUCT__entry( __field( int, irq ) __field( int, ret ) ), TP_fast_assign( __entry->irq = irq; __entry->ret = ret; ), TP_printk("irq=%d ret=%s", __entry->irq, __entry->ret ? "handled" : "unhandled") ); DECLARE_EVENT_CLASS(softirq, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr), TP_STRUCT__entry( __field( unsigned int, vec ) ), TP_fast_assign( __entry->vec = vec_nr; ), TP_printk("vec=%u [action=%s]", __entry->vec, show_softirq_name(__entry->vec)) ); /** * softirq_entry - called immediately before the softirq handler * @vec_nr: softirq vector number * * When used in combination with the softirq_exit tracepoint * we can determine the softirq handler routine. */ DEFINE_EVENT(softirq, softirq_entry, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr) ); /** * softirq_exit - called immediately after the softirq handler returns * @vec_nr: softirq vector number * * When used in combination with the softirq_entry tracepoint * we can determine the softirq handler routine. */ DEFINE_EVENT(softirq, softirq_exit, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr) ); /** * softirq_raise - called immediately when a softirq is raised * @vec_nr: softirq vector number * * When used in combination with the softirq_entry tracepoint * we can determine the softirq raise to run latency. */ DEFINE_EVENT(softirq, softirq_raise, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr) ); DECLARE_EVENT_CLASS(tasklet, TP_PROTO(struct tasklet_struct *t, void *func), TP_ARGS(t, func), TP_STRUCT__entry( __field( void *, tasklet) __field( void *, func) ), TP_fast_assign( __entry->tasklet = t; __entry->func = func; ), TP_printk("tasklet=%ps function=%ps", __entry->tasklet, __entry->func) ); /** * tasklet_entry - called immediately before the tasklet is run * @t: tasklet pointer * @func: tasklet callback or function being run * * Used to find individual tasklet execution time */ DEFINE_EVENT(tasklet, tasklet_entry, TP_PROTO(struct tasklet_struct *t, void *func), TP_ARGS(t, func) ); /** * tasklet_exit - called immediately after the tasklet is run * @t: tasklet pointer * @func: tasklet callback or function being run * * Used to find individual tasklet execution time */ DEFINE_EVENT(tasklet, tasklet_exit, TP_PROTO(struct tasklet_struct *t, void *func), TP_ARGS(t, func) ); #endif /* _TRACE_IRQ_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
231 226 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <net/flow_offload.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/pkt_cls.h> static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL); if (!flow) return NULL; flow->rule = flow_rule_alloc(num_actions); if (!flow->rule) { kfree(flow); return NULL; } flow->rule->match.dissector = &flow->match.dissector; flow->rule->match.mask = &flow->match.mask; flow->rule->match.key = &flow->match.key; return flow; } void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type) { struct nft_flow_match *match = &flow->match; struct nft_flow_key *mask = &match->mask; struct nft_flow_key *key = &match->key; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } struct nft_offload_ethertype { __be16 value; __be16 mask; }; static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow) { struct nft_flow_match *match = &flow->match; struct nft_offload_ethertype ethertype = { .value = match->key.basic.n_proto, .mask = match->mask.basic.n_proto, }; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid; match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid; match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); } else if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { struct nft_offload_ctx *ctx; struct nft_flow_rule *flow; int num_actions = 0, err; struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->offload_action && expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); } if (num_actions == 0) return ERR_PTR(-EOPNOTSUPP); flow = nft_flow_rule_alloc(num_actions); if (!flow) return ERR_PTR(-ENOMEM); expr = nft_expr_first(rule); ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto err_out; } ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; } err = expr->ops->offload(ctx, flow, expr); if (err < 0) goto err_out; expr = nft_expr_next(expr); } nft_flow_rule_transfer_vlan(ctx, flow); flow->proto = ctx->dep.l3num; kfree(ctx); return flow; err_out: kfree(ctx); nft_flow_rule_destroy(flow); return ERR_PTR(err); } void nft_flow_rule_destroy(struct nft_flow_rule *flow) { struct flow_action_entry *entry; int i; flow_action_for_each(i, entry, &flow->rule->action) { switch (entry->id) { case FLOW_ACTION_REDIRECT: case FLOW_ACTION_MIRRED: dev_put(entry->dev); break; default: break; } } kfree(flow->rule); kfree(flow); } void nft_offload_set_dependency(struct nft_offload_ctx *ctx, enum nft_offload_dep_type type) { ctx->dep.type = type; } void nft_offload_update_dependency(struct nft_offload_ctx *ctx, const void *data, u32 len) { switch (ctx->dep.type) { case NFT_OFFLOAD_DEP_NETWORK: WARN_ON(len != sizeof(__u16)); memcpy(&ctx->dep.l3num, data, sizeof(__u16)); break; case NFT_OFFLOAD_DEP_TRANSPORT: WARN_ON(len != sizeof(__u8)); memcpy(&ctx->dep.protonum, data, sizeof(__u8)); break; default: break; } ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; } static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, __be16 proto, int priority, struct netlink_ext_ack *extack) { common->protocol = proto; common->prio = priority; common->extack = extack; } static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; } return 0; } static int nft_chain_offload_priority(const struct nft_base_chain *basechain) { if (basechain->ops.priority <= 0 || basechain->ops.priority > USHRT_MAX) return -1; return 0; } bool nft_chain_offload_support(const struct nft_base_chain *basechain) { struct net_device *dev; struct nft_hook *hook; if (nft_chain_offload_priority(basechain) < 0) return false; list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.pf != NFPROTO_NETDEV || hook->ops.hooknum != NF_NETDEV_INGRESS) return false; dev = hook->ops.dev; if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) return false; } return true; } static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, const struct nft_base_chain *basechain, const struct nft_rule *rule, const struct nft_flow_rule *flow, struct netlink_ext_ack *extack, enum flow_cls_command command) { __be16 proto = ETH_P_ALL; memset(cls_flow, 0, sizeof(*cls_flow)); if (flow) proto = flow->proto; nft_flow_offload_common_init(&cls_flow->common, proto, basechain->ops.priority, extack); cls_flow->command = command; cls_flow->cookie = (unsigned long) rule; if (flow) cls_flow->rule = flow->rule; } static int nft_flow_offload_cmd(const struct nft_chain *chain, const struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command, struct flow_cls_offload *cls_flow) { struct netlink_ext_ack extack = {}; struct nft_base_chain *basechain; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack, command); return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow, &basechain->flow_block.cb_list); } static int nft_flow_offload_rule(const struct nft_chain *chain, struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command) { struct flow_cls_offload cls_flow; return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow); } int nft_flow_rule_stats(const struct nft_chain *chain, const struct nft_rule *rule) { struct flow_cls_offload cls_flow = {}; struct nft_expr *expr, *next; int err; err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS, &cls_flow); if (err < 0) return err; nft_rule_for_each_expr(expr, next, rule) { if (expr->ops->offload_stats) expr->ops->offload_stats(expr, &cls_flow.stats); } return 0; } static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { list_splice(&bo->cb_list, &basechain->flow_block.cb_list); return 0; } static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; struct flow_cls_offload cls_flow; struct netlink_ext_ack extack; struct nft_chain *chain; struct nft_rule *rule; chain = &basechain->chain; list_for_each_entry(rule, &chain->rules, list) { memset(&extack, 0, sizeof(extack)); nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, &extack, FLOW_CLS_DESTROY); nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); flow_block_cb_free(block_cb); } return 0; } static int nft_block_setup(struct nft_base_chain *basechain, struct flow_block_offload *bo, enum flow_block_command cmd) { int err; switch (cmd) { case FLOW_BLOCK_BIND: err = nft_flow_offload_bind(bo, basechain); break; case FLOW_BLOCK_UNBIND: err = nft_flow_offload_unbind(bo, basechain); break; default: WARN_ON_ONCE(1); err = -EOPNOTSUPP; } return err; } static void nft_flow_block_offload_init(struct flow_block_offload *bo, struct net *net, enum flow_block_command cmd, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { memset(bo, 0, sizeof(*bo)); bo->net = net; bo->block = &basechain->flow_block; bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; bo->cb_list_head = &basechain->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } static int nft_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) return err; return nft_block_setup(chain, &bo, cmd); } static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct flow_block_offload bo; nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); mutex_unlock(&nft_net->commit_mutex); } static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo, nft_indr_block_cleanup); if (err < 0) return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; return nft_block_setup(basechain, &bo, cmd); } static int nft_chain_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { int err; if (dev->netdev_ops->ndo_setup_tc) err = nft_block_offload_cmd(basechain, dev, cmd); else err = nft_indr_block_offload_cmd(basechain, dev, cmd); return err; } static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { struct net_device *dev; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { dev = hook->ops.dev; if (this_dev && this_dev != dev) continue; err = nft_chain_offload_cmd(basechain, dev, cmd); if (err < 0 && cmd == FLOW_BLOCK_BIND) { if (!this_dev) goto err_flow_block; return err; } i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { if (i-- <= 0) break; dev = hook->ops.dev; nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); } return err; } static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { struct nft_base_chain *basechain; u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; return nft_flow_block_chain(basechain, NULL, cmd); } static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; err = nft_flow_offload_chain(trans->ctx.chain, NULL, FLOW_BLOCK_UNBIND); break; case NFT_MSG_DELCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_chain(trans->ctx.chain, NULL, FLOW_BLOCK_BIND); break; case NFT_MSG_NEWRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; case NFT_MSG_DELRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; } if (WARN_ON_ONCE(err)) break; } } int nft_flow_rule_offload_commit(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; int err = 0; u8 policy; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(trans->ctx.chain, &policy, FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(trans->ctx.chain, &policy, FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; if (trans->ctx.flags & NLM_F_REPLACE || !(trans->ctx.flags & NLM_F_APPEND)) { err = -EOPNOTSUPP; break; } err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; } if (err) { nft_flow_rule_offload_abort(net, trans); break; } } return err; } static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, struct net_device *dev) { struct nft_base_chain *basechain; struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain) || !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.dev != dev) continue; found = hook; break; } if (!found) continue; return chain; } } return NULL; } static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); chain = __nft_offload_get_chain(nft_net, dev); if (chain) nft_flow_block_chain(nft_base_chain(chain), dev, FLOW_BLOCK_UNBIND); mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { unregister_netdevice_notifier(&nft_offload_netdev_notifier); }
5195 3346 1211 67 1432 1433 1433 1120 5835 3599 230 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_FIND_H_ #define __LINUX_FIND_H_ #ifndef __LINUX_BITMAP_H #error only <linux/bitmap.h> can be included directly #endif #include <linux/bitops.h> unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); #endif #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit(addr, size, offset); } #endif #ifndef find_next_and_bit /** * find_next_and_bit - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & *addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_and_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_andnot_bit /** * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits * in *addr2 * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_andnot_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_or_bit /** * find_next_or_bit - find the next set bit in either memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static inline unsigned long find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = (*addr1 | *addr2) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_or_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit(addr, size, offset); } #endif #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ static inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_bit(addr, size); } #endif /** * find_nth_bit - find N'th set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * The following is semantically equivalent: * idx = find_nth_bit(addr, size, 0); * idx = find_first_bit(addr, size); * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_bit(addr, size, n); } /** * find_nth_and_bit - find N'th set bit in 2 memory regions * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_bit(addr1, addr2, size, n); } /** * find_nth_andnot_bit - find N'th set bit in 2 memory regions, * flipping bits in 2nd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static inline unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_andnot_bit(addr1, addr2, size, n); } /** * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions, * excluding those set in 3rd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @addr3: The 3rd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n); } #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_bit(addr1, addr2, size); } #endif #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ static inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit(addr, size); } #endif #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at * @size: The number of bits to search * * Returns the bit number of the last set bit, or size. */ static inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __fls(val) : size; } return _find_last_bit(addr, size); } #endif /** * find_next_and_bit_wrap - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); if (bit < size) return bit; bit = find_first_and_bit(addr1, addr2, offset); return bit < offset ? bit : size; } /** * find_next_bit_wrap - find the next set bit in both memory regions * @addr: The first address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { unsigned long bit = find_next_bit(addr, size, offset); if (bit < size) return bit; bit = find_first_bit(addr, offset); return bit < offset ? bit : size; } /* * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ static inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { unsigned long bit; /* If not wrapped around */ if (n > start) { /* and have a bit, just return it. */ bit = find_next_bit(bitmap, size, n); if (bit < size) return bit; /* Otherwise, wrap around and ... */ n = 0; } /* Search the other part. */ bit = find_next_bit(bitmap, start, n); return bit < start ? bit : size; } /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump * @addr: address to base the search on * @size: bitmap size in number of bits * @offset: bit offset at which to start searching * * Returns the bit offset for the next set clump; the found clump value is * copied to the location pointed by @clump. If no bits are set, returns @size. */ extern unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset); #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) #if defined(__LITTLE_ENDIAN) static inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } static inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } static inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); } #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le static inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit_le(addr, size, offset); } #endif #ifndef find_first_zero_bit_le static inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit_le(addr, size); } #endif #ifndef find_next_bit_le static inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit_le(addr, size, offset); } #endif #else #error "Please fix <asm/byteorder.h>" #endif #define for_each_set_bit(bit, addr, size) \ for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_andnot_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_or_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit) * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_bit((addr), (size), b), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_bit((addr), (size), (b)), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first unset bit) * @e: bit offset of end of current bitrange (first set bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bit_wrap - iterate over all set bits starting from @start, and * wrapping around the end of bitmap. * @bit: offset for current iteration * @addr: bitmap address to base the search on * @size: bitmap size in number of bits * @start: Starting bit for bitmap traversing, wrapping around the bitmap end */ #define for_each_set_bit_wrap(bit, addr, size, start) \ for ((bit) = find_next_bit_wrap((addr), (size), (start)); \ (bit) < (size); \ (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1)) /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset * @clump: location to store copy of current 8-bit clump * @bits: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_clump8(start, clump, bits, size) \ for ((start) = find_first_clump8(&(clump), (bits), (size)); \ (start) < (size); \ (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) #endif /*__LINUX_FIND_H_ */
9 11 9 9 9 9 10 9 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 /* * linux/drivers/video/fb_defio.c * * Copyright (C) 2006 Jaya Kumar * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/fb.h> #include <linux/list.h> /* to support deferred IO */ #include <linux/rmap.h> #include <linux/pagemap.h> static struct page *fb_deferred_io_page(struct fb_info *info, unsigned long offs) { void *screen_base = (void __force *) info->screen_base; struct page *page; if (is_vmalloc_addr(screen_base + offs)) page = vmalloc_to_page(screen_base + offs); else page = pfn_to_page((info->fix.smem_start + offs) >> PAGE_SHIFT); return page; } static struct fb_deferred_io_pageref *fb_deferred_io_pageref_get(struct fb_info *info, unsigned long offset, struct page *page) { struct fb_deferred_io *fbdefio = info->fbdefio; struct list_head *pos = &fbdefio->pagereflist; unsigned long pgoff = offset >> PAGE_SHIFT; struct fb_deferred_io_pageref *pageref, *cur; if (WARN_ON_ONCE(pgoff >= info->npagerefs)) return NULL; /* incorrect allocation size */ /* 1:1 mapping between pageref and page offset */ pageref = &info->pagerefs[pgoff]; /* * This check is to catch the case where a new process could start * writing to the same page through a new PTE. This new access * can cause a call to .page_mkwrite even if the original process' * PTE is marked writable. */ if (!list_empty(&pageref->list)) goto pageref_already_added; pageref->page = page; pageref->offset = pgoff << PAGE_SHIFT; if (unlikely(fbdefio->sort_pagereflist)) { /* * We loop through the list of pagerefs before adding in * order to keep the pagerefs sorted. This has significant * overhead of O(n^2) with n being the number of written * pages. If possible, drivers should try to work with * unsorted page lists instead. */ list_for_each_entry(cur, &fbdefio->pagereflist, list) { if (cur->offset > pageref->offset) break; } pos = &cur->list; } list_add_tail(&pageref->list, pos); pageref_already_added: return pageref; } static void fb_deferred_io_pageref_put(struct fb_deferred_io_pageref *pageref, struct fb_info *info) { list_del_init(&pageref->list); } /* this is to find and return the vmalloc-ed fb pages */ static vm_fault_t fb_deferred_io_fault(struct vm_fault *vmf) { unsigned long offset; struct page *page; struct fb_info *info = vmf->vma->vm_private_data; offset = vmf->pgoff << PAGE_SHIFT; if (offset >= info->fix.smem_len) return VM_FAULT_SIGBUS; page = fb_deferred_io_page(info, offset); if (!page) return VM_FAULT_SIGBUS; get_page(page); if (vmf->vma->vm_file) page->mapping = vmf->vma->vm_file->f_mapping; else printk(KERN_ERR "no mapping available\n"); BUG_ON(!page->mapping); page->index = vmf->pgoff; /* for page_mkclean() */ vmf->page = page; return 0; } int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fb_info *info = file->private_data; struct inode *inode = file_inode(file); int err = file_write_and_wait_range(file, start, end); if (err) return err; /* Skip if deferred io is compiled-in but disabled on this fbdev */ if (!info->fbdefio) return 0; inode_lock(inode); /* Kill off the delayed work */ cancel_delayed_work_sync(&info->deferred_work); /* Run it immediately */ schedule_delayed_work(&info->deferred_work, 0); inode_unlock(inode); return 0; } EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* * Adds a page to the dirty list. Call this from struct * vm_operations_struct.page_mkwrite. */ static vm_fault_t fb_deferred_io_track_page(struct fb_info *info, unsigned long offset, struct page *page) { struct fb_deferred_io *fbdefio = info->fbdefio; struct fb_deferred_io_pageref *pageref; vm_fault_t ret; /* protect against the workqueue changing the page list */ mutex_lock(&fbdefio->lock); pageref = fb_deferred_io_pageref_get(info, offset, page); if (WARN_ON_ONCE(!pageref)) { ret = VM_FAULT_OOM; goto err_mutex_unlock; } /* * We want the page to remain locked from ->page_mkwrite until * the PTE is marked dirty to avoid page_mkclean() being called * before the PTE is updated, which would leave the page ignored * by defio. * Do this by locking the page here and informing the caller * about it with VM_FAULT_LOCKED. */ lock_page(pageref->page); mutex_unlock(&fbdefio->lock); /* come back after delay to process the deferred IO */ schedule_delayed_work(&info->deferred_work, fbdefio->delay); return VM_FAULT_LOCKED; err_mutex_unlock: mutex_unlock(&fbdefio->lock); return ret; } /* * fb_deferred_io_page_mkwrite - Mark a page as written for deferred I/O * @fb_info: The fbdev info structure * @vmf: The VM fault * * This is a callback we get when userspace first tries to * write to the page. We schedule a workqueue. That workqueue * will eventually mkclean the touched pages and execute the * deferred framebuffer IO. Then if userspace touches a page * again, we repeat the same scheme. * * Returns: * VM_FAULT_LOCKED on success, or a VM_FAULT error otherwise. */ static vm_fault_t fb_deferred_io_page_mkwrite(struct fb_info *info, struct vm_fault *vmf) { unsigned long offset = vmf->address - vmf->vma->vm_start; struct page *page = vmf->page; file_update_time(vmf->vma->vm_file); return fb_deferred_io_track_page(info, offset, page); } /* vm_ops->page_mkwrite handler */ static vm_fault_t fb_deferred_io_mkwrite(struct vm_fault *vmf) { struct fb_info *info = vmf->vma->vm_private_data; return fb_deferred_io_page_mkwrite(info, vmf); } static const struct vm_operations_struct fb_deferred_io_vm_ops = { .fault = fb_deferred_io_fault, .page_mkwrite = fb_deferred_io_mkwrite, }; static const struct address_space_operations fb_deferred_io_aops = { .dirty_folio = noop_dirty_folio, }; int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { vma->vm_ops = &fb_deferred_io_vm_ops; vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); if (!(info->flags & FBINFO_VIRTFB)) vm_flags_set(vma, VM_IO); vma->vm_private_data = info; return 0; } EXPORT_SYMBOL_GPL(fb_deferred_io_mmap); /* workqueue callback */ static void fb_deferred_io_work(struct work_struct *work) { struct fb_info *info = container_of(work, struct fb_info, deferred_work.work); struct fb_deferred_io_pageref *pageref, *next; struct fb_deferred_io *fbdefio = info->fbdefio; /* here we mkclean the pages, then do all deferred IO */ mutex_lock(&fbdefio->lock); list_for_each_entry(pageref, &fbdefio->pagereflist, list) { struct page *cur = pageref->page; lock_page(cur); page_mkclean(cur); unlock_page(cur); } /* driver's callback with pagereflist */ fbdefio->deferred_io(info, &fbdefio->pagereflist); /* clear the list */ list_for_each_entry_safe(pageref, next, &fbdefio->pagereflist, list) fb_deferred_io_pageref_put(pageref, info); mutex_unlock(&fbdefio->lock); } int fb_deferred_io_init(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; struct fb_deferred_io_pageref *pagerefs; unsigned long npagerefs, i; int ret; BUG_ON(!fbdefio); if (WARN_ON(!info->fix.smem_len)) return -EINVAL; mutex_init(&fbdefio->lock); INIT_DELAYED_WORK(&info->deferred_work, fb_deferred_io_work); INIT_LIST_HEAD(&fbdefio->pagereflist); if (fbdefio->delay == 0) /* set a default of 1 s */ fbdefio->delay = HZ; npagerefs = DIV_ROUND_UP(info->fix.smem_len, PAGE_SIZE); /* alloc a page ref for each page of the display memory */ pagerefs = kvcalloc(npagerefs, sizeof(*pagerefs), GFP_KERNEL); if (!pagerefs) { ret = -ENOMEM; goto err; } for (i = 0; i < npagerefs; ++i) INIT_LIST_HEAD(&pagerefs[i].list); info->npagerefs = npagerefs; info->pagerefs = pagerefs; return 0; err: mutex_destroy(&fbdefio->lock); return ret; } EXPORT_SYMBOL_GPL(fb_deferred_io_init); void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file) { struct fb_deferred_io *fbdefio = info->fbdefio; file->f_mapping->a_ops = &fb_deferred_io_aops; fbdefio->open_count++; } EXPORT_SYMBOL_GPL(fb_deferred_io_open); static void fb_deferred_io_lastclose(struct fb_info *info) { struct page *page; int i; cancel_delayed_work_sync(&info->deferred_work); /* clear out the mapping that we setup */ for (i = 0 ; i < info->fix.smem_len; i += PAGE_SIZE) { page = fb_deferred_io_page(info, i); page->mapping = NULL; } } void fb_deferred_io_release(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; if (!--fbdefio->open_count) fb_deferred_io_lastclose(info); } EXPORT_SYMBOL_GPL(fb_deferred_io_release); void fb_deferred_io_cleanup(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; fb_deferred_io_lastclose(info); kvfree(info->pagerefs); mutex_destroy(&fbdefio->lock); } EXPORT_SYMBOL_GPL(fb_deferred_io_cleanup);
2 39 39 38 40 39 1 1 1 38 37 37 37 37 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* af_can.c - Protocol family CAN core module * (used by different CAN protocol modules) * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/stddef.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/uaccess.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/can-ml.h> #include <linux/ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include "af_can.h" MODULE_DESCRIPTION("Controller Area Network PF_CAN core"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, " "Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS_NETPROTO(PF_CAN); static int stats_timer __read_mostly = 1; module_param(stats_timer, int, 0444); MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); static struct kmem_cache *rcv_cache __read_mostly; /* table of registered CAN protocols */ static const struct can_proto __rcu *proto_tab[CAN_NPROTO] __read_mostly; static DEFINE_MUTEX(proto_tab_lock); static atomic_t skbcounter = ATOMIC_INIT(0); /* af_can socket functions */ void can_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_error_queue); } EXPORT_SYMBOL(can_sock_destruct); static const struct can_proto *can_get_proto(int protocol) { const struct can_proto *cp; rcu_read_lock(); cp = rcu_dereference(proto_tab[protocol]); if (cp && !try_module_get(cp->prot->owner)) cp = NULL; rcu_read_unlock(); return cp; } static inline void can_put_proto(const struct can_proto *cp) { module_put(cp->prot->owner); } static int can_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; const struct can_proto *cp; int err = 0; sock->state = SS_UNCONNECTED; if (protocol < 0 || protocol >= CAN_NPROTO) return -EINVAL; cp = can_get_proto(protocol); #ifdef CONFIG_MODULES if (!cp) { /* try to load protocol module if kernel is modular */ err = request_module("can-proto-%d", protocol); /* In case of error we only print a message but don't * return the error code immediately. Below we will * return -EPROTONOSUPPORT */ if (err) pr_err_ratelimited("can: request_module (can-proto-%d) failed.\n", protocol); cp = can_get_proto(protocol); } #endif /* check for available protocol and correct usage */ if (!cp) return -EPROTONOSUPPORT; if (cp->type != sock->type) { err = -EPROTOTYPE; goto errout; } sock->ops = cp->ops; sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern); if (!sk) { err = -ENOMEM; goto errout; } sock_init_data(sock, sk); sk->sk_destruct = can_sock_destruct; if (sk->sk_prot->init) err = sk->sk_prot->init(sk); if (err) { /* release sk on errors */ sock_orphan(sk); sock_put(sk); } errout: can_put_proto(cp); return err; } /* af_can tx path */ /** * can_send - transmit a CAN frame (optional with local loopback) * @skb: pointer to socket buffer with CAN frame in data section * @loop: loopback for listeners on local CAN sockets (recommended default!) * * Due to the loopback this routine must not be called from hardirq context. * * Return: * 0 on success * -ENETDOWN when the selected interface is down * -ENOBUFS on full driver queue (see net_xmit_errno()) * -ENOMEM when local loopback failed at calling skb_clone() * -EPERM when trying to send on a non-CAN interface * -EMSGSIZE CAN frame size is bigger than CAN interface MTU * -EINVAL when the skb->data does not contain a valid CAN frame */ int can_send(struct sk_buff *skb, int loop) { struct sk_buff *newskb = NULL; struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats; int err = -EINVAL; if (can_is_canxl_skb(skb)) { skb->protocol = htons(ETH_P_CANXL); } else if (can_is_can_skb(skb)) { skb->protocol = htons(ETH_P_CAN); } else if (can_is_canfd_skb(skb)) { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; skb->protocol = htons(ETH_P_CANFD); /* set CAN FD flag for CAN FD frames by default */ cfd->flags |= CANFD_FDF; } else { goto inval_skb; } /* Make sure the CAN frame can pass the selected CAN netdevice. */ if (unlikely(skb->len > skb->dev->mtu)) { err = -EMSGSIZE; goto inval_skb; } if (unlikely(skb->dev->type != ARPHRD_CAN)) { err = -EPERM; goto inval_skb; } if (unlikely(!(skb->dev->flags & IFF_UP))) { err = -ENETDOWN; goto inval_skb; } skb->ip_summed = CHECKSUM_UNNECESSARY; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); if (loop) { /* local loopback of sent CAN frames */ /* indication for the CAN driver: do loopback */ skb->pkt_type = PACKET_LOOPBACK; /* The reference to the originating sock may be required * by the receiving socket to check whether the frame is * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS * Therefore we have to ensure that skb->sk remains the * reference to the originating sock by restoring skb->sk * after each skb_clone() or skb_orphan() usage. */ if (!(skb->dev->flags & IFF_ECHO)) { /* If the interface is not capable to do loopback * itself, we do it here. */ newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) { kfree_skb(skb); return -ENOMEM; } can_skb_set_owner(newskb, skb->sk); newskb->ip_summed = CHECKSUM_UNNECESSARY; newskb->pkt_type = PACKET_BROADCAST; } } else { /* indication for the CAN driver: no loopback required */ skb->pkt_type = PACKET_HOST; } /* send to netdevice */ err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); if (err) { kfree_skb(newskb); return err; } if (newskb) netif_rx(newskb); /* update statistics */ pkg_stats->tx_frames++; pkg_stats->tx_frames_delta++; return 0; inval_skb: kfree_skb(skb); return err; } EXPORT_SYMBOL(can_send); /* af_can rx path */ static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net, struct net_device *dev) { if (dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); return &can_ml->dev_rcv_lists; } else { return net->can.rx_alldev_list; } } /** * effhash - hash function for 29 bit CAN identifier reduction * @can_id: 29 bit CAN identifier * * Description: * To reduce the linear traversal in one linked list of _single_ EFF CAN * frame subscriptions the 29 bit identifier is mapped to 10 bits. * (see CAN_EFF_RCV_HASH_BITS definition) * * Return: * Hash value from 0x000 - 0x3FF ( enforced by CAN_EFF_RCV_HASH_BITS mask ) */ static unsigned int effhash(canid_t can_id) { unsigned int hash; hash = can_id; hash ^= can_id >> CAN_EFF_RCV_HASH_BITS; hash ^= can_id >> (2 * CAN_EFF_RCV_HASH_BITS); return hash & ((1 << CAN_EFF_RCV_HASH_BITS) - 1); } /** * can_rcv_list_find - determine optimal filterlist inside device filter struct * @can_id: pointer to CAN identifier of a given can_filter * @mask: pointer to CAN mask of a given can_filter * @dev_rcv_lists: pointer to the device filter struct * * Description: * Returns the optimal filterlist to reduce the filter handling in the * receive path. This function is called by service functions that need * to register or unregister a can_filter in the filter lists. * * A filter matches in general, when * * <received_can_id> & mask == can_id & mask * * so every bit set in the mask (even CAN_EFF_FLAG, CAN_RTR_FLAG) describe * relevant bits for the filter. * * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error messages (CAN_ERR_FLAG bit set in mask). For error msg * frames there is a special filterlist and a special rx path filter handling. * * Return: * Pointer to optimal filterlist for the given can_id/mask pair. * Consistency checked mask. * Reduced can_id to have a preprocessed filter compare value. */ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask, struct can_dev_rcv_lists *dev_rcv_lists) { canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */ /* filter for error message frames in extra filterlist */ if (*mask & CAN_ERR_FLAG) { /* clear CAN_ERR_FLAG in filter entry */ *mask &= CAN_ERR_MASK; return &dev_rcv_lists->rx[RX_ERR]; } /* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */ #define CAN_EFF_RTR_FLAGS (CAN_EFF_FLAG | CAN_RTR_FLAG) /* ensure valid values in can_mask for 'SFF only' frame filtering */ if ((*mask & CAN_EFF_FLAG) && !(*can_id & CAN_EFF_FLAG)) *mask &= (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS); /* reduce condition testing at receive time */ *can_id &= *mask; /* inverse can_id/can_mask filter */ if (inv) return &dev_rcv_lists->rx[RX_INV]; /* mask == 0 => no condition testing at receive time */ if (!(*mask)) return &dev_rcv_lists->rx[RX_ALL]; /* extra filterlists for the subscription of a single non-RTR can_id */ if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) && !(*can_id & CAN_RTR_FLAG)) { if (*can_id & CAN_EFF_FLAG) { if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS)) return &dev_rcv_lists->rx_eff[effhash(*can_id)]; } else { if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS)) return &dev_rcv_lists->rx_sff[*can_id]; } } /* default: filter via can_id/can_mask */ return &dev_rcv_lists->rx[RX_FIL]; } /** * can_rx_register - subscribe CAN frames from a specific interface * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list) * @can_id: CAN identifier (see description) * @mask: CAN mask (see description) * @func: callback function on filter match * @data: returned parameter for callback function * @ident: string for calling module identification * @sk: socket pointer (might be NULL) * * Description: * Invokes the callback function with the received sk_buff and the given * parameter 'data' on a matching receive filter. A filter matches, when * * <received_can_id> & mask == can_id & mask * * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error message frames (CAN_ERR_FLAG bit set in mask). * * The provided pointer to the sk_buff is guaranteed to be valid as long as * the callback function is running. The callback function must *not* free * the given sk_buff while processing it's task. When the given sk_buff is * needed after the end of the callback function it must be cloned inside * the callback function with skb_clone(). * * Return: * 0 on success * -ENOMEM on missing cache mem to create subscription entry * -ENODEV unknown device */ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data, char *ident, struct sock *sk) { struct receiver *rcv; struct hlist_head *rcv_list; struct can_dev_rcv_lists *dev_rcv_lists; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; /* insert new receiver (dev,canid,mask) -> (func,data) */ if (dev && (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev))) return -ENODEV; if (dev && !net_eq(net, dev_net(dev))) return -ENODEV; rcv = kmem_cache_alloc(rcv_cache, GFP_KERNEL); if (!rcv) return -ENOMEM; spin_lock_bh(&net->can.rcvlists_lock); dev_rcv_lists = can_dev_rcv_lists_find(net, dev); rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); rcv->can_id = can_id; rcv->mask = mask; rcv->matches = 0; rcv->func = func; rcv->data = data; rcv->ident = ident; rcv->sk = sk; hlist_add_head_rcu(&rcv->list, rcv_list); dev_rcv_lists->entries++; rcv_lists_stats->rcv_entries++; rcv_lists_stats->rcv_entries_max = max(rcv_lists_stats->rcv_entries_max, rcv_lists_stats->rcv_entries); spin_unlock_bh(&net->can.rcvlists_lock); return 0; } EXPORT_SYMBOL(can_rx_register); /* can_rx_delete_receiver - rcu callback for single receiver entry removal */ static void can_rx_delete_receiver(struct rcu_head *rp) { struct receiver *rcv = container_of(rp, struct receiver, rcu); struct sock *sk = rcv->sk; kmem_cache_free(rcv_cache, rcv); if (sk) sock_put(sk); } /** * can_rx_unregister - unsubscribe CAN frames from a specific interface * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list) * @can_id: CAN identifier * @mask: CAN mask * @func: callback function on filter match * @data: returned parameter for callback function * * Description: * Removes subscription entry depending on given (subscription) values. */ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data) { struct receiver *rcv = NULL; struct hlist_head *rcv_list; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; struct can_dev_rcv_lists *dev_rcv_lists; if (dev && dev->type != ARPHRD_CAN) return; if (dev && !net_eq(net, dev_net(dev))) return; spin_lock_bh(&net->can.rcvlists_lock); dev_rcv_lists = can_dev_rcv_lists_find(net, dev); rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); /* Search the receiver list for the item to delete. This should * exist, since no receiver may be unregistered that hasn't * been registered before. */ hlist_for_each_entry_rcu(rcv, rcv_list, list) { if (rcv->can_id == can_id && rcv->mask == mask && rcv->func == func && rcv->data == data) break; } /* Check for bugs in CAN protocol implementations using af_can.c: * 'rcv' will be NULL if no matching list item was found for removal. * As this case may potentially happen when closing a socket while * the notifier for removing the CAN netdev is running we just print * a warning here. */ if (!rcv) { pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n", DNAME(dev), can_id, mask); goto out; } hlist_del_rcu(&rcv->list); dev_rcv_lists->entries--; if (rcv_lists_stats->rcv_entries > 0) rcv_lists_stats->rcv_entries--; out: spin_unlock_bh(&net->can.rcvlists_lock); /* schedule the receiver item for deletion */ if (rcv) { if (rcv->sk) sock_hold(rcv->sk); call_rcu(&rcv->rcu, can_rx_delete_receiver); } } EXPORT_SYMBOL(can_rx_unregister); static inline void deliver(struct sk_buff *skb, struct receiver *rcv) { rcv->func(skb, rcv->data); rcv->matches++; } static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb) { struct receiver *rcv; int matches = 0; struct can_frame *cf = (struct can_frame *)skb->data; canid_t can_id = cf->can_id; if (dev_rcv_lists->entries == 0) return 0; if (can_id & CAN_ERR_FLAG) { /* check for error message frame entries only */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ERR], list) { if (can_id & rcv->mask) { deliver(skb, rcv); matches++; } } return matches; } /* check for unfiltered entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ALL], list) { deliver(skb, rcv); matches++; } /* check for can_id/mask entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_FIL], list) { if ((can_id & rcv->mask) == rcv->can_id) { deliver(skb, rcv); matches++; } } /* check for inverted can_id/mask entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_INV], list) { if ((can_id & rcv->mask) != rcv->can_id) { deliver(skb, rcv); matches++; } } /* check filterlists for single non-RTR can_ids */ if (can_id & CAN_RTR_FLAG) return matches; if (can_id & CAN_EFF_FLAG) { hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_eff[effhash(can_id)], list) { if (rcv->can_id == can_id) { deliver(skb, rcv); matches++; } } } else { can_id &= CAN_SFF_MASK; hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_sff[can_id], list) { deliver(skb, rcv); matches++; } } return matches; } static void can_receive(struct sk_buff *skb, struct net_device *dev) { struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = dev_net(dev); struct can_pkg_stats *pkg_stats = net->can.pkg_stats; int matches; /* update statistics */ pkg_stats->rx_frames++; pkg_stats->rx_frames_delta++; /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter); rcu_read_lock(); /* deliver the packet to sockets listening on all devices */ matches = can_rcv_filter(net->can.rx_alldev_list, skb); /* find receive list for this device */ dev_rcv_lists = can_dev_rcv_lists_find(net, dev); matches += can_rcv_filter(dev_rcv_lists, skb); rcu_read_unlock(); /* consume the skbuff allocated by the netdevice driver */ consume_skb(skb); if (matches > 0) { pkg_stats->matches++; pkg_stats->matches_delta++; } } static int can_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb(skb); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb(skb); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } static int canxl_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb(skb); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } /* af_can protocol functions */ /** * can_proto_register - register CAN transport protocol * @cp: pointer to CAN protocol structure * * Return: * 0 on success * -EINVAL invalid (out of range) protocol number * -EBUSY protocol already in use * -ENOBUF if proto_register() fails */ int can_proto_register(const struct can_proto *cp) { int proto = cp->protocol; int err = 0; if (proto < 0 || proto >= CAN_NPROTO) { pr_err("can: protocol number %d out of range\n", proto); return -EINVAL; } err = proto_register(cp->prot, 0); if (err < 0) return err; mutex_lock(&proto_tab_lock); if (rcu_access_pointer(proto_tab[proto])) { pr_err("can: protocol %d already registered\n", proto); err = -EBUSY; } else { RCU_INIT_POINTER(proto_tab[proto], cp); } mutex_unlock(&proto_tab_lock); if (err < 0) proto_unregister(cp->prot); return err; } EXPORT_SYMBOL(can_proto_register); /** * can_proto_unregister - unregister CAN transport protocol * @cp: pointer to CAN protocol structure */ void can_proto_unregister(const struct can_proto *cp) { int proto = cp->protocol; mutex_lock(&proto_tab_lock); BUG_ON(rcu_access_pointer(proto_tab[proto]) != cp); RCU_INIT_POINTER(proto_tab[proto], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); proto_unregister(cp->prot); } EXPORT_SYMBOL(can_proto_unregister); static int can_pernet_init(struct net *net) { spin_lock_init(&net->can.rcvlists_lock); net->can.rx_alldev_list = kzalloc(sizeof(*net->can.rx_alldev_list), GFP_KERNEL); if (!net->can.rx_alldev_list) goto out; net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL); if (!net->can.pkg_stats) goto out_free_rx_alldev_list; net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats), GFP_KERNEL); if (!net->can.rcv_lists_stats) goto out_free_pkg_stats; if (IS_ENABLED(CONFIG_PROC_FS)) { /* the statistics are updated every second (timer triggered) */ if (stats_timer) { timer_setup(&net->can.stattimer, can_stat_update, 0); mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); } net->can.pkg_stats->jiffies_init = jiffies; can_init_proc(net); } return 0; out_free_pkg_stats: kfree(net->can.pkg_stats); out_free_rx_alldev_list: kfree(net->can.rx_alldev_list); out: return -ENOMEM; } static void can_pernet_exit(struct net *net) { if (IS_ENABLED(CONFIG_PROC_FS)) { can_remove_proc(net); if (stats_timer) del_timer_sync(&net->can.stattimer); } kfree(net->can.rx_alldev_list); kfree(net->can.pkg_stats); kfree(net->can.rcv_lists_stats); } /* af_can module init/exit functions */ static struct packet_type can_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CAN), .func = can_rcv, }; static struct packet_type canfd_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CANFD), .func = canfd_rcv, }; static struct packet_type canxl_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CANXL), .func = canxl_rcv, }; static const struct net_proto_family can_family_ops = { .family = PF_CAN, .create = can_create, .owner = THIS_MODULE, }; static struct pernet_operations can_pernet_ops __read_mostly = { .init = can_pernet_init, .exit = can_pernet_exit, }; static __init int can_init(void) { int err; /* check for correct padding to be able to use the structs similarly */ BUILD_BUG_ON(offsetof(struct can_frame, len) != offsetof(struct canfd_frame, len) || offsetof(struct can_frame, data) != offsetof(struct canfd_frame, data)); pr_info("can: controller area network core\n"); rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver), 0, 0, NULL); if (!rcv_cache) return -ENOMEM; err = register_pernet_subsys(&can_pernet_ops); if (err) goto out_pernet; /* protocol register */ err = sock_register(&can_family_ops); if (err) goto out_sock; dev_add_pack(&can_packet); dev_add_pack(&canfd_packet); dev_add_pack(&canxl_packet); return 0; out_sock: unregister_pernet_subsys(&can_pernet_ops); out_pernet: kmem_cache_destroy(rcv_cache); return err; } static __exit void can_exit(void) { /* protocol unregister */ dev_remove_pack(&canxl_packet); dev_remove_pack(&canfd_packet); dev_remove_pack(&can_packet); sock_unregister(PF_CAN); unregister_pernet_subsys(&can_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(rcv_cache); } module_init(can_init); module_exit(can_exit);
222 222 226 226 231 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 // SPDX-License-Identifier: GPL-2.0-only /* * Monitoring code for network dropped packet alerts * * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <linux/bitfield.h> #include <linux/percpu.h> #include <linux/timer.h> #include <linux/bitops.h> #include <linux/slab.h> #include <linux/module.h> #include <net/genetlink.h> #include <net/netevent.h> #include <net/flow_offload.h> #include <net/dropreason.h> #include <net/devlink.h> #include <trace/events/skb.h> #include <trace/events/napi.h> #include <trace/events/devlink.h> #include <asm/unaligned.h> #define TRACE_ON 1 #define TRACE_OFF 0 /* * Globals, our netlink socket pointer * and the work handle that will send up * netlink alerts */ static int trace_state = TRACE_OFF; static bool monitor_hw; /* net_dm_mutex * * An overall lock guarding every operation coming from userspace. */ static DEFINE_MUTEX(net_dm_mutex); struct net_dm_stats { u64_stats_t dropped; struct u64_stats_sync syncp; }; #define NET_DM_MAX_HW_TRAP_NAME_LEN 40 struct net_dm_hw_entry { char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN]; u32 count; }; struct net_dm_hw_entries { u32 num_entries; struct net_dm_hw_entry entries[]; }; struct per_cpu_dm_data { spinlock_t lock; /* Protects 'skb', 'hw_entries' and * 'send_timer' */ union { struct sk_buff *skb; struct net_dm_hw_entries *hw_entries; }; struct sk_buff_head drop_queue; struct work_struct dm_alert_work; struct timer_list send_timer; struct net_dm_stats stats; }; struct dm_hw_stat_delta { unsigned long last_rx; unsigned long last_drop_val; struct rcu_head rcu; }; static struct genl_family net_drop_monitor_family; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; static u32 net_dm_trunc_len; static u32 net_dm_queue_len = 1000; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason); void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, int work, int budget); void (*work_item_func)(struct work_struct *work); void (*hw_work_item_func)(struct work_struct *work); void (*hw_trap_probe)(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata); }; struct net_dm_skb_cb { union { struct devlink_trap_metadata *hw_metadata; void *pc; }; enum skb_drop_reason reason; }; #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; struct net_dm_alert_msg *msg; struct nlattr *nla; struct sk_buff *skb; unsigned long flags; void *msg_header; al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); al += sizeof(struct nlattr); skb = genlmsg_new(al, GFP_KERNEL); if (!skb) goto err; msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_ALERT); if (!msg_header) { nlmsg_free(skb); skb = NULL; goto err; } nla = nla_reserve(skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg)); if (!nla) { nlmsg_free(skb); skb = NULL; goto err; } msg = nla_data(nla); memset(msg, 0, al); goto out; err: mod_timer(&data->send_timer, jiffies + HZ / 10); out: spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); spin_unlock_irqrestore(&data->lock, flags); if (skb) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh); genlmsg_end(skb, genlmsg_data(gnlh)); } return skb; } static const struct genl_multicast_group dropmon_mcgrps[] = { { .name = "events", }, }; static void send_dm_alert(struct work_struct *work) { struct sk_buff *skb; struct per_cpu_dm_data *data; data = container_of(work, struct per_cpu_dm_data, dm_alert_work); skb = reset_per_cpu_data(data); if (skb) genlmsg_multicast(&net_drop_monitor_family, skb, 0, 0, GFP_KERNEL); } /* * This is the timer function to delay the sending of an alert * in the event that more drops will arrive during the * hysteresis period. */ static void sched_send_work(struct timer_list *t) { struct per_cpu_dm_data *data = from_timer(data, t, send_timer); schedule_work(&data->dm_alert_work); } static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct net_dm_drop_point *point; struct nlmsghdr *nlh; struct nlattr *nla; int i; struct sk_buff *dskb; struct per_cpu_dm_data *data; unsigned long flags; local_irq_save(flags); data = this_cpu_ptr(&dm_cpu_data); spin_lock(&data->lock); dskb = data->skb; if (!dskb) goto out; nlh = (struct nlmsghdr *)dskb->data; nla = genlmsg_data(nlmsg_data(nlh)); msg = nla_data(nla); point = msg->points; for (i = 0; i < msg->entries; i++) { if (!memcmp(&location, &point->pc, sizeof(void *))) { point->count++; goto out; } point++; } if (msg->entries == dm_hit_limit) goto out; /* * We need to create a new entry */ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); memcpy(point->pc, &location, sizeof(void *)); point->count = 1; msg->entries++; if (!timer_pending(&data->send_timer)) { data->send_timer.expires = jiffies + dm_delay * HZ; add_timer(&data->send_timer); } out: spin_unlock_irqrestore(&data->lock, flags); } static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason) { trace_drop_common(skb, location); } static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { struct net_device *dev = napi->dev; struct dm_hw_stat_delta *stat; /* * Don't check napi structures with no associated device */ if (!dev) return; rcu_read_lock(); stat = rcu_dereference(dev->dm_private); if (stat) { /* * only add a note to our monitor buffer if: * 1) its after the last_rx delta * 2) our rx_dropped count has gone up */ if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) && (dev->stats.rx_dropped != stat->last_drop_val)) { trace_drop_common(NULL, NULL); stat->last_drop_val = dev->stats.rx_dropped; stat->last_rx = jiffies; } } rcu_read_unlock(); } static struct net_dm_hw_entries * net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) { struct net_dm_hw_entries *hw_entries; unsigned long flags; hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit), GFP_KERNEL); if (!hw_entries) { /* If the memory allocation failed, we try to perform another * allocation in 1/10 second. Otherwise, the probe function * will constantly bail out. */ mod_timer(&hw_data->send_timer, jiffies + HZ / 10); } spin_lock_irqsave(&hw_data->lock, flags); swap(hw_data->hw_entries, hw_entries); spin_unlock_irqrestore(&hw_data->lock, flags); return hw_entries; } static int net_dm_hw_entry_put(struct sk_buff *msg, const struct net_dm_hw_entry *hw_entry) { struct nlattr *attr; attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY); if (!attr) return -EMSGSIZE; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_hw_entries_put(struct sk_buff *msg, const struct net_dm_hw_entries *hw_entries) { struct nlattr *attr; int i; attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES); if (!attr) return -EMSGSIZE; for (i = 0; i < hw_entries->num_entries; i++) { int rc; rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]); if (rc) goto nla_put_failure; } nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_hw_summary_report_fill(struct sk_buff *msg, const struct net_dm_hw_entries *hw_entries) { struct net_dm_alert_msg anc_hdr = { 0 }; void *hdr; int rc; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_ALERT); if (!hdr) return -EMSGSIZE; /* We need to put the ancillary header in order not to break user * space. */ if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr)) goto nla_put_failure; rc = net_dm_hw_entries_put(msg, hw_entries); if (rc) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void net_dm_hw_summary_work(struct work_struct *work) { struct net_dm_hw_entries *hw_entries; struct per_cpu_dm_data *hw_data; struct sk_buff *msg; int rc; hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); if (!hw_entries) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) goto out; rc = net_dm_hw_summary_report_fill(msg, hw_entries); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: kfree(hw_entries); } static void net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata) { struct net_dm_hw_entries *hw_entries; struct net_dm_hw_entry *hw_entry; struct per_cpu_dm_data *hw_data; unsigned long flags; int i; if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) return; hw_data = this_cpu_ptr(&dm_hw_cpu_data); spin_lock_irqsave(&hw_data->lock, flags); hw_entries = hw_data->hw_entries; if (!hw_entries) goto out; for (i = 0; i < hw_entries->num_entries; i++) { hw_entry = &hw_entries->entries[i]; if (!strncmp(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) { hw_entry->count++; goto out; } } if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit)) goto out; hw_entry = &hw_entries->entries[hw_entries->num_entries]; strscpy(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1); hw_entry->count = 1; hw_entries->num_entries++; if (!timer_pending(&hw_data->send_timer)) { hw_data->send_timer.expires = jiffies + dm_delay * HZ; add_timer(&hw_data->send_timer); } out: spin_unlock_irqrestore(&hw_data->lock, flags); } static const struct net_dm_alert_ops net_dm_alert_summary_ops = { .kfree_skb_probe = trace_kfree_skb_hit, .napi_poll_probe = trace_napi_poll_hit, .work_item_func = send_dm_alert, .hw_work_item_func = net_dm_hw_summary_work, .hw_trap_probe = net_dm_hw_trap_summary_probe, }; static void net_dm_packet_trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason) { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; struct net_dm_skb_cb *cb; struct sk_buff *nskb; unsigned long flags; if (!skb_mac_header_was_set(skb)) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; cb = NET_DM_SKB_CB(nskb); cb->reason = reason; cb->pc = location; /* Override the timestamp because we care about the time when the * packet was dropped. */ nskb->tstamp = tstamp; data = this_cpu_ptr(&dm_cpu_data); spin_lock_irqsave(&data->drop_queue.lock, flags); if (skb_queue_len(&data->drop_queue) < net_dm_queue_len) __skb_queue_tail(&data->drop_queue, nskb); else goto unlock_free; spin_unlock_irqrestore(&data->drop_queue.lock, flags); schedule_work(&data->dm_alert_work); return; unlock_free: spin_unlock_irqrestore(&data->drop_queue.lock, flags); u64_stats_update_begin(&data->stats.syncp); u64_stats_inc(&data->stats.dropped); u64_stats_update_end(&data->stats.syncp); consume_skb(nskb); } static void net_dm_packet_trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { } static size_t net_dm_in_port_size(void) { /* NET_DM_ATTR_IN_PORT nest */ return nla_total_size(0) + /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PORT_NETDEV_NAME */ nla_total_size(IFNAMSIZ + 1); } #define NET_DM_MAX_SYMBOL_LEN 40 #define NET_DM_MAX_REASON_LEN 50 static size_t net_dm_packet_report_size(size_t payload_len) { size_t size; size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); return NLMSG_ALIGN(size) + /* NET_DM_ATTR_ORIGIN */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PC */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_SYMBOL */ nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) + /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_REASON */ nla_total_size(NET_DM_MAX_REASON_LEN + 1) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex, const char *name) { struct nlattr *attr; attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT); if (!attr) return -EMSGSIZE; if (ifindex && nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex)) goto nla_put_failure; if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb); const struct drop_reason_list *list = NULL; unsigned int subsys, subsys_reason; char buf[NET_DM_MAX_SYMBOL_LEN]; struct nlattr *attr; void *hdr; int rc; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_PACKET_ALERT); if (!hdr) return -EMSGSIZE; if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc, NET_DM_ATTR_PAD)) goto nla_put_failure; rcu_read_lock(); subsys = u32_get_bits(cb->reason, SKB_DROP_REASON_SUBSYS_MASK); if (subsys < SKB_DROP_REASON_SUBSYS_NUM) list = rcu_dereference(drop_reasons_by_subsys[subsys]); subsys_reason = cb->reason & ~SKB_DROP_REASON_SUBSYS_MASK; if (!list || subsys_reason >= list->n_reasons || !list->reasons[subsys_reason] || strlen(list->reasons[subsys_reason]) > NET_DM_MAX_REASON_LEN) { list = rcu_dereference(drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_CORE]); subsys_reason = SKB_DROP_REASON_NOT_SPECIFIED; } if (nla_put_string(msg, NET_DM_ATTR_REASON, list->reasons[subsys_reason])) { rcu_read_unlock(); goto nla_put_failure; } rcu_read_unlock(); snprintf(buf, sizeof(buf), "%pS", cb->pc); if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) goto nla_put_failure; rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL); if (rc) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) goto nla_put_failure; if (!payload_len) goto out; if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) goto nla_put_failure; attr = skb_put(msg, nla_total_size(payload_len)); attr->nla_type = NET_DM_ATTR_PAYLOAD; attr->nla_len = nla_attr_size(payload_len); if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) goto nla_put_failure; out: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } #define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO) static void net_dm_packet_report(struct sk_buff *skb) { struct sk_buff *msg; size_t payload_len; int rc; /* Make sure we start copying the packet from the MAC header */ if (skb->data > skb_mac_header(skb)) skb_push(skb, skb->data - skb_mac_header(skb)); else skb_pull(skb, skb_mac_header(skb) - skb->data); /* Ensure packet fits inside a single netlink attribute */ payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); if (!msg) goto out; rc = net_dm_packet_report_fill(msg, skb, payload_len); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: consume_skb(skb); } static void net_dm_packet_work(struct work_struct *work) { struct per_cpu_dm_data *data; struct sk_buff_head list; struct sk_buff *skb; unsigned long flags; data = container_of(work, struct per_cpu_dm_data, dm_alert_work); __skb_queue_head_init(&list); spin_lock_irqsave(&data->drop_queue.lock, flags); skb_queue_splice_tail_init(&data->drop_queue, &list); spin_unlock_irqrestore(&data->drop_queue.lock, flags); while ((skb = __skb_dequeue(&list))) net_dm_packet_report(skb); } static size_t net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata) { return hw_metadata->fa_cookie ? nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0; } static size_t net_dm_hw_packet_report_size(size_t payload_len, const struct devlink_trap_metadata *hw_metadata) { size_t size; size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); return NLMSG_ALIGN(size) + /* NET_DM_ATTR_ORIGIN */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */ nla_total_size(strlen(hw_metadata->trap_group_name) + 1) + /* NET_DM_ATTR_HW_TRAP_NAME */ nla_total_size(strlen(hw_metadata->trap_name) + 1) + /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + /* NET_DM_ATTR_FLOW_ACTION_COOKIE */ net_dm_flow_action_cookie_size(hw_metadata) + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } static int net_dm_hw_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { struct devlink_trap_metadata *hw_metadata; struct nlattr *attr; void *hdr; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_PACKET_ALERT); if (!hdr) return -EMSGSIZE; if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW)) goto nla_put_failure; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME, hw_metadata->trap_group_name)) goto nla_put_failure; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_metadata->trap_name)) goto nla_put_failure; if (hw_metadata->input_dev) { struct net_device *dev = hw_metadata->input_dev; int rc; rc = net_dm_packet_report_in_port_put(msg, dev->ifindex, dev->name); if (rc) goto nla_put_failure; } if (hw_metadata->fa_cookie && nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE, hw_metadata->fa_cookie->cookie_len, hw_metadata->fa_cookie->cookie)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) goto nla_put_failure; if (!payload_len) goto out; if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) goto nla_put_failure; attr = skb_put(msg, nla_total_size(payload_len)); attr->nla_type = NET_DM_ATTR_PAYLOAD; attr->nla_len = nla_attr_size(payload_len); if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) goto nla_put_failure; out: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct devlink_trap_metadata * net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) { const struct flow_action_cookie *fa_cookie; struct devlink_trap_metadata *hw_metadata; const char *trap_group_name; const char *trap_name; hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); if (!hw_metadata) return NULL; trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC); if (!trap_group_name) goto free_hw_metadata; hw_metadata->trap_group_name = trap_group_name; trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC); if (!trap_name) goto free_trap_group; hw_metadata->trap_name = trap_name; if (metadata->fa_cookie) { size_t cookie_size = sizeof(*fa_cookie) + metadata->fa_cookie->cookie_len; fa_cookie = kmemdup(metadata->fa_cookie, cookie_size, GFP_ATOMIC); if (!fa_cookie) goto free_trap_name; hw_metadata->fa_cookie = fa_cookie; } hw_metadata->input_dev = metadata->input_dev; netdev_hold(hw_metadata->input_dev, &hw_metadata->dev_tracker, GFP_ATOMIC); return hw_metadata; free_trap_name: kfree(trap_name); free_trap_group: kfree(trap_group_name); free_hw_metadata: kfree(hw_metadata); return NULL; } static void net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata) { netdev_put(hw_metadata->input_dev, &hw_metadata->dev_tracker); kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); kfree(hw_metadata); } static void net_dm_hw_packet_report(struct sk_buff *skb) { struct devlink_trap_metadata *hw_metadata; struct sk_buff *msg; size_t payload_len; int rc; if (skb->data > skb_mac_header(skb)) skb_push(skb, skb->data - skb_mac_header(skb)); else skb_pull(skb, skb_mac_header(skb) - skb->data); payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata), GFP_KERNEL); if (!msg) goto out; rc = net_dm_hw_packet_report_fill(msg, skb, payload_len); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata); consume_skb(skb); } static void net_dm_hw_packet_work(struct work_struct *work) { struct per_cpu_dm_data *hw_data; struct sk_buff_head list; struct sk_buff *skb; unsigned long flags; hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); __skb_queue_head_init(&list); spin_lock_irqsave(&hw_data->drop_queue.lock, flags); skb_queue_splice_tail_init(&hw_data->drop_queue, &list); spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); while ((skb = __skb_dequeue(&list))) net_dm_hw_packet_report(skb); } static void net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata) { struct devlink_trap_metadata *n_hw_metadata; ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *hw_data; struct sk_buff *nskb; unsigned long flags; if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) return; if (!skb_mac_header_was_set(skb)) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; n_hw_metadata = net_dm_hw_metadata_copy(metadata); if (!n_hw_metadata) goto free; NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata; nskb->tstamp = tstamp; hw_data = this_cpu_ptr(&dm_hw_cpu_data); spin_lock_irqsave(&hw_data->drop_queue.lock, flags); if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len) __skb_queue_tail(&hw_data->drop_queue, nskb); else goto unlock_free; spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); schedule_work(&hw_data->dm_alert_work); return; unlock_free: spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); u64_stats_update_begin(&hw_data->stats.syncp); u64_stats_inc(&hw_data->stats.dropped); u64_stats_update_end(&hw_data->stats.syncp); net_dm_hw_metadata_free(n_hw_metadata); free: consume_skb(nskb); } static const struct net_dm_alert_ops net_dm_alert_packet_ops = { .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit, .napi_poll_probe = net_dm_packet_trace_napi_poll_hit, .work_item_func = net_dm_packet_work, .hw_work_item_func = net_dm_hw_packet_work, .hw_trap_probe = net_dm_hw_trap_packet_probe, }; static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops, [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops, }; #if IS_ENABLED(CONFIG_NET_DEVLINK) static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) { return register_trace_devlink_trap_report(ops->hw_trap_probe, NULL); } static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) { unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL); tracepoint_synchronize_unregister(); } #else static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) { return -EOPNOTSUPP; } static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) { } #endif static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu, rc; if (monitor_hw) { NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled"); return -EAGAIN; } ops = net_dm_alert_ops_arr[net_dm_alert_mode]; if (!try_module_get(THIS_MODULE)) { NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); return -ENODEV; } for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct net_dm_hw_entries *hw_entries; INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func); timer_setup(&hw_data->send_timer, sched_send_work, 0); hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); kfree(hw_entries); } rc = net_dm_hw_probe_register(ops); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint"); goto err_module_put; } monitor_hw = true; return 0; err_module_put: for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; net_dm_hw_metadata_free(hw_metadata); consume_skb(skb); } } module_put(THIS_MODULE); return rc; } static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu; if (!monitor_hw) { NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled"); return; } ops = net_dm_alert_ops_arr[net_dm_alert_mode]; monitor_hw = false; net_dm_hw_probe_unregister(ops); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; net_dm_hw_metadata_free(hw_metadata); consume_skb(skb); } } module_put(THIS_MODULE); } static int net_dm_trace_on_set(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu, rc; ops = net_dm_alert_ops_arr[net_dm_alert_mode]; if (!try_module_get(THIS_MODULE)) { NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); return -ENODEV; } for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; INIT_WORK(&data->dm_alert_work, ops->work_item_func); timer_setup(&data->send_timer, sched_send_work, 0); /* Allocate a new per-CPU skb for the summary alert message and * free the old one which might contain stale data from * previous tracing. */ skb = reset_per_cpu_data(data); consume_skb(skb); } rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint"); goto err_module_put; } rc = register_trace_napi_poll(ops->napi_poll_probe, NULL); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint"); goto err_unregister_trace; } return 0; err_unregister_trace: unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); err_module_put: for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); } module_put(THIS_MODULE); return rc; } static void net_dm_trace_off_set(void) { const struct net_dm_alert_ops *ops; int cpu; ops = net_dm_alert_ops_arr[net_dm_alert_mode]; unregister_trace_napi_poll(ops->napi_poll_probe, NULL); unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); tracepoint_synchronize_unregister(); /* Make sure we do not send notifications to user space after request * to stop tracing returns. */ for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); } module_put(THIS_MODULE); } static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack) { int rc = 0; if (state == trace_state) { NL_SET_ERR_MSG_MOD(extack, "Trace state already set to requested state"); return -EAGAIN; } switch (state) { case TRACE_ON: rc = net_dm_trace_on_set(extack); break; case TRACE_OFF: net_dm_trace_off_set(); break; default: rc = 1; break; } if (!rc) trace_state = state; else rc = -EINPROGRESS; return rc; } static bool net_dm_is_monitoring(void) { return trace_state == TRACE_ON || monitor_hw; } static int net_dm_alert_mode_get_from_info(struct genl_info *info, enum net_dm_alert_mode *p_alert_mode) { u8 val; val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]); switch (val) { case NET_DM_ALERT_MODE_SUMMARY: case NET_DM_ALERT_MODE_PACKET: *p_alert_mode = val; break; default: return -EINVAL; } return 0; } static int net_dm_alert_mode_set(struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; enum net_dm_alert_mode alert_mode; int rc; if (!info->attrs[NET_DM_ATTR_ALERT_MODE]) return 0; rc = net_dm_alert_mode_get_from_info(info, &alert_mode); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode"); return -EINVAL; } net_dm_alert_mode = alert_mode; return 0; } static void net_dm_trunc_len_set(struct genl_info *info) { if (!info->attrs[NET_DM_ATTR_TRUNC_LEN]) return; net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]); } static void net_dm_queue_len_set(struct genl_info *info) { if (!info->attrs[NET_DM_ATTR_QUEUE_LEN]) return; net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]); } static int net_dm_cmd_config(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; int rc; if (net_dm_is_monitoring()) { NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring"); return -EBUSY; } rc = net_dm_alert_mode_set(info); if (rc) return rc; net_dm_trunc_len_set(info); net_dm_queue_len_set(info); return 0; } static int net_dm_monitor_start(bool set_sw, bool set_hw, struct netlink_ext_ack *extack) { bool sw_set = false; int rc; if (set_sw) { rc = set_all_monitor_traces(TRACE_ON, extack); if (rc) return rc; sw_set = true; } if (set_hw) { rc = net_dm_hw_monitor_start(extack); if (rc) goto err_monitor_hw; } return 0; err_monitor_hw: if (sw_set) set_all_monitor_traces(TRACE_OFF, extack); return rc; } static void net_dm_monitor_stop(bool set_sw, bool set_hw, struct netlink_ext_ack *extack) { if (set_hw) net_dm_hw_monitor_stop(extack); if (set_sw) set_all_monitor_traces(TRACE_OFF, extack); } static int net_dm_cmd_trace(struct sk_buff *skb, struct genl_info *info) { bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS]; bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS]; struct netlink_ext_ack *extack = info->extack; /* To maintain backward compatibility, we start / stop monitoring of * software drops if no flag is specified. */ if (!set_sw && !set_hw) set_sw = true; switch (info->genlhdr->cmd) { case NET_DM_CMD_START: return net_dm_monitor_start(set_sw, set_hw, extack); case NET_DM_CMD_STOP: net_dm_monitor_stop(set_sw, set_hw, extack); return 0; } return -EOPNOTSUPP; } static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info) { void *hdr; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW); if (!hdr) return -EMSGSIZE; if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; int rc; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rc = net_dm_config_fill(msg, info); if (rc) goto free_msg; return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); return rc; } static void net_dm_stats_read(struct net_dm_stats *stats) { int cpu; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct net_dm_stats *cpu_stats = &data->stats; unsigned int start; u64 dropped; do { start = u64_stats_fetch_begin(&cpu_stats->syncp); dropped = u64_stats_read(&cpu_stats->dropped); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->dropped, dropped); } } static int net_dm_stats_put(struct sk_buff *msg) { struct net_dm_stats stats; struct nlattr *attr; net_dm_stats_read(&stats); attr = nla_nest_start(msg, NET_DM_ATTR_STATS); if (!attr) return -EMSGSIZE; if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static void net_dm_hw_stats_read(struct net_dm_stats *stats) { int cpu; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct net_dm_stats *cpu_stats = &hw_data->stats; unsigned int start; u64 dropped; do { start = u64_stats_fetch_begin(&cpu_stats->syncp); dropped = u64_stats_read(&cpu_stats->dropped); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->dropped, dropped); } } static int net_dm_hw_stats_put(struct sk_buff *msg) { struct net_dm_stats stats; struct nlattr *attr; net_dm_hw_stats_read(&stats); attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS); if (!attr) return -EMSGSIZE; if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info) { void *hdr; int rc; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW); if (!hdr) return -EMSGSIZE; rc = net_dm_stats_put(msg); if (rc) goto nla_put_failure; rc = net_dm_hw_stats_put(msg); if (rc) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; int rc; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rc = net_dm_stats_fill(msg, info); if (rc) goto free_msg; return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); return rc; } static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dm_hw_stat_delta *stat; switch (event) { case NETDEV_REGISTER: if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) break; stat = kzalloc(sizeof(*stat), GFP_KERNEL); if (!stat) break; stat->last_rx = jiffies; rcu_assign_pointer(dev->dm_private, stat); break; case NETDEV_UNREGISTER: stat = rtnl_dereference(dev->dm_private); if (stat) { rcu_assign_pointer(dev->dm_private, NULL); kfree_rcu(stat, rcu); } break; } return NOTIFY_DONE; } static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 }, [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 }, [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 }, [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG }, [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG }, }; static const struct genl_small_ops dropmon_ops[] = { { .cmd = NET_DM_CMD_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_config, .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, }, { .cmd = NET_DM_CMD_STOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, }, { .cmd = NET_DM_CMD_CONFIG_GET, .doit = net_dm_cmd_config_get, }, { .cmd = NET_DM_CMD_STATS_GET, .doit = net_dm_cmd_stats_get, }, }; static int net_dm_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_lock(&net_dm_mutex); return 0; } static void net_dm_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_unlock(&net_dm_mutex); } static struct genl_family net_drop_monitor_family __ro_after_init = { .hdrsize = 0, .name = "NET_DM", .version = 2, .maxattr = NET_DM_ATTR_MAX, .policy = net_dm_nl_policy, .pre_doit = net_dm_nl_pre_doit, .post_doit = net_dm_nl_post_doit, .module = THIS_MODULE, .small_ops = dropmon_ops, .n_small_ops = ARRAY_SIZE(dropmon_ops), .resv_start_op = NET_DM_CMD_STATS_GET + 1, .mcgrps = dropmon_mcgrps, .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), }; static struct notifier_block dropmon_net_notifier = { .notifier_call = dropmon_net_event }; static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) { spin_lock_init(&data->lock); skb_queue_head_init(&data->drop_queue); u64_stats_init(&data->stats.syncp); } static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data) { WARN_ON(!skb_queue_empty(&data->drop_queue)); } static void net_dm_cpu_data_init(int cpu) { struct per_cpu_dm_data *data; data = &per_cpu(dm_cpu_data, cpu); __net_dm_cpu_data_init(data); } static void net_dm_cpu_data_fini(int cpu) { struct per_cpu_dm_data *data; data = &per_cpu(dm_cpu_data, cpu); /* At this point, we should have exclusive access * to this struct and can free the skb inside it. */ consume_skb(data->skb); __net_dm_cpu_data_fini(data); } static void net_dm_hw_cpu_data_init(int cpu) { struct per_cpu_dm_data *hw_data; hw_data = &per_cpu(dm_hw_cpu_data, cpu); __net_dm_cpu_data_init(hw_data); } static void net_dm_hw_cpu_data_fini(int cpu) { struct per_cpu_dm_data *hw_data; hw_data = &per_cpu(dm_hw_cpu_data, cpu); kfree(hw_data->hw_entries); __net_dm_cpu_data_fini(hw_data); } static int __init init_net_drop_monitor(void) { int cpu, rc; pr_info("Initializing network drop monitor service\n"); if (sizeof(void *) > 8) { pr_err("Unable to store program counters on this arch, Drop monitor failed\n"); return -ENOSPC; } rc = genl_register_family(&net_drop_monitor_family); if (rc) { pr_err("Could not create drop monitor netlink family\n"); return rc; } WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); goto out_unreg; } rc = 0; for_each_possible_cpu(cpu) { net_dm_cpu_data_init(cpu); net_dm_hw_cpu_data_init(cpu); } goto out; out_unreg: genl_unregister_family(&net_drop_monitor_family); out: return rc; } static void exit_net_drop_monitor(void) { int cpu; BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); module_exit(exit_net_drop_monitor); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>"); MODULE_ALIAS_GENL_FAMILY("NET_DM"); MODULE_DESCRIPTION("Monitoring code for network dropped packet alerts");
102 102 49 101 102 887 102 102 887 363 363 351 351 351 351 207 351 351 351 1720 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * * Authors: * Mimi Zohar <zohar@us.ibm.com> * * File: integrity_iint.c * - implements the integrity hooks: integrity_inode_alloc, * integrity_inode_free * - cache integrity information associated with an inode * using a rbtree tree. */ #include <linux/slab.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/file.h> #include <linux/uaccess.h> #include <linux/security.h> #include <linux/lsm_hooks.h> #include "integrity.h" static struct rb_root integrity_iint_tree = RB_ROOT; static DEFINE_RWLOCK(integrity_iint_lock); static struct kmem_cache *iint_cache __ro_after_init; struct dentry *integrity_dir; /* * __integrity_iint_find - return the iint associated with an inode */ static struct integrity_iint_cache *__integrity_iint_find(struct inode *inode) { struct integrity_iint_cache *iint; struct rb_node *n = integrity_iint_tree.rb_node; while (n) { iint = rb_entry(n, struct integrity_iint_cache, rb_node); if (inode < iint->inode) n = n->rb_left; else if (inode > iint->inode) n = n->rb_right; else return iint; } return NULL; } /* * integrity_iint_find - return the iint associated with an inode */ struct integrity_iint_cache *integrity_iint_find(struct inode *inode) { struct integrity_iint_cache *iint; if (!IS_IMA(inode)) return NULL; read_lock(&integrity_iint_lock); iint = __integrity_iint_find(inode); read_unlock(&integrity_iint_lock); return iint; } #define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH+1) /* * It is not clear that IMA should be nested at all, but as long is it measures * files both on overlayfs and on underlying fs, we need to annotate the iint * mutex to avoid lockdep false positives related to IMA + overlayfs. * See ovl_lockdep_annotate_inode_mutex_key() for more details. */ static inline void iint_lockdep_annotate(struct integrity_iint_cache *iint, struct inode *inode) { #ifdef CONFIG_LOCKDEP static struct lock_class_key iint_mutex_key[IMA_MAX_NESTING]; int depth = inode->i_sb->s_stack_depth; if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING)) depth = 0; lockdep_set_class(&iint->mutex, &iint_mutex_key[depth]); #endif } static void iint_init_always(struct integrity_iint_cache *iint, struct inode *inode) { iint->ima_hash = NULL; iint->version = 0; iint->flags = 0UL; iint->atomic_flags = 0UL; iint->ima_file_status = INTEGRITY_UNKNOWN; iint->ima_mmap_status = INTEGRITY_UNKNOWN; iint->ima_bprm_status = INTEGRITY_UNKNOWN; iint->ima_read_status = INTEGRITY_UNKNOWN; iint->ima_creds_status = INTEGRITY_UNKNOWN; iint->evm_status = INTEGRITY_UNKNOWN; iint->measured_pcrs = 0; mutex_init(&iint->mutex); iint_lockdep_annotate(iint, inode); } static void iint_free(struct integrity_iint_cache *iint) { kfree(iint->ima_hash); mutex_destroy(&iint->mutex); kmem_cache_free(iint_cache, iint); } /** * integrity_inode_get - find or allocate an iint associated with an inode * @inode: pointer to the inode * @return: allocated iint * * Caller must lock i_mutex */ struct integrity_iint_cache *integrity_inode_get(struct inode *inode) { struct rb_node **p; struct rb_node *node, *parent = NULL; struct integrity_iint_cache *iint, *test_iint; iint = integrity_iint_find(inode); if (iint) return iint; iint = kmem_cache_alloc(iint_cache, GFP_NOFS); if (!iint) return NULL; iint_init_always(iint, inode); write_lock(&integrity_iint_lock); p = &integrity_iint_tree.rb_node; while (*p) { parent = *p; test_iint = rb_entry(parent, struct integrity_iint_cache, rb_node); if (inode < test_iint->inode) { p = &(*p)->rb_left; } else if (inode > test_iint->inode) { p = &(*p)->rb_right; } else { write_unlock(&integrity_iint_lock); kmem_cache_free(iint_cache, iint); return test_iint; } } iint->inode = inode; node = &iint->rb_node; inode->i_flags |= S_IMA; rb_link_node(node, parent, p); rb_insert_color(node, &integrity_iint_tree); write_unlock(&integrity_iint_lock); return iint; } /** * integrity_inode_free - called on security_inode_free * @inode: pointer to the inode * * Free the integrity information(iint) associated with an inode. */ void integrity_inode_free(struct inode *inode) { struct integrity_iint_cache *iint; if (!IS_IMA(inode)) return; write_lock(&integrity_iint_lock); iint = __integrity_iint_find(inode); rb_erase(&iint->rb_node, &integrity_iint_tree); write_unlock(&integrity_iint_lock); iint_free(iint); } static void iint_init_once(void *foo) { struct integrity_iint_cache *iint = (struct integrity_iint_cache *) foo; memset(iint, 0, sizeof(*iint)); } static int __init integrity_iintcache_init(void) { iint_cache = kmem_cache_create("iint_cache", sizeof(struct integrity_iint_cache), 0, SLAB_PANIC, iint_init_once); return 0; } DEFINE_LSM(integrity) = { .name = "integrity", .init = integrity_iintcache_init, .order = LSM_ORDER_LAST, }; /* * integrity_kernel_read - read data from the file * * This is a function for reading file content instead of kernel_read(). * It does not perform locking checks to ensure it cannot be blocked. * It does not perform security checks because it is irrelevant for IMA. * */ int integrity_kernel_read(struct file *file, loff_t offset, void *addr, unsigned long count) { return __kernel_read(file, addr, count, &offset); } /* * integrity_load_keys - load integrity keys hook * * Hooks is called from init/main.c:kernel_init_freeable() * when rootfs is ready */ void __init integrity_load_keys(void) { ima_load_x509(); if (!IS_ENABLED(CONFIG_IMA_LOAD_X509)) evm_load_x509(); } static int __init integrity_fs_init(void) { integrity_dir = securityfs_create_dir("integrity", NULL); if (IS_ERR(integrity_dir)) { int ret = PTR_ERR(integrity_dir); if (ret != -ENODEV) pr_err("Unable to create integrity sysfs dir: %d\n", ret); integrity_dir = NULL; return ret; } return 0; } late_initcall(integrity_fs_init)
120 319 187 187 168 168 2 110 218 72 72 218 218 168 168 168 168 53 53 53 53 53 53 130 316 104 19 19 149 149 138 42 42 104 139 139 139 222 104 53 37 61 61 60 60 61 187 187 307 307 307 307 139 305 307 307 187 89 9 89 53 89 321 321 321 89 89 89 89 89 89 53 89 89 89 53 89 110 110 39 39 88 89 110 110 53 110 110 110 110 110 53 110 53 110 89 53 37 53 110 110 168 168 168 168 61 61 61 61 2 139 139 168 2 168 19 168 149 139 139 139 139 149 110 20 29 24 29 29 29 29 29 26 25 4 4 4 4 307 307 306 94 94 94 94 94 94 94 94 94 94 94 94 94 94 168 94 94 94 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles * or rs-channels. It also implements echoing, cooked mode etc. * * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0. * * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the * tty_struct and tty_queue structures. Previously there was an array * of 256 tty_struct's which was statically allocated, and the * tty_queue structures were allocated at boot time. Both are now * dynamically allocated only when the tty is open. * * Also restructured routines so that there is more of a separation * between the high-level tty routines (tty_io.c and tty_ioctl.c) and * the low-level tty routines (serial.c, pty.c, console.c). This * makes for cleaner and more compact code. -TYT, 9/17/92 * * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines * which can be dynamically activated and de-activated by the line * discipline handling modules (like SLIP). * * NOTE: pay no attention to the line discipline code (yet); its * interface is still subject to change in this version... * -- TYT, 1/31/92 * * Added functionality to the OPOST tty handling. No delays, but all * other bits should be there. * -- Nick Holloway <alfie@dcs.warwick.ac.uk>, 27th May 1993. * * Rewrote canonical mode and added more termios flags. * -- julian@uhunix.uhcc.hawaii.edu (J. Cowley), 13Jan94 * * Reorganized FASYNC support so mouse code can share it. * -- ctm@ardi.com, 9Sep95 * * New TIOCLINUX variants added. * -- mj@k332.feld.cvut.cz, 19-Nov-95 * * Restrict vt switching via ioctl() * -- grif@cs.ucr.edu, 5-Dec-95 * * Move console and virtual terminal code to more appropriate files, * implement CONFIG_VT and generalize console device interface. * -- Marko Kohtala <Marko.Kohtala@hut.fi>, March 97 * * Rewrote tty_init_dev and tty_release_dev to eliminate races. * -- Bill Hawes <whawes@star.net>, June 97 * * Added devfs support. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 13-Jan-1998 * * Added support for a Unix98-style ptmx device. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 * * Reduced memory usage for older ARM systems * -- Russell King <rmk@arm.linux.org.uk> * * Move do_SAK() into process context. Less stack use in devfs functions. * alloc_tty_struct() always uses kmalloc() * -- Andrew Morton <andrewm@uow.edu.eu> 17Mar01 */ #include <linux/types.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/fcntl.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/devpts_fs.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/console.h> #include <linux/timer.h> #include <linux/ctype.h> #include <linux/kd.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/ppp-ioctl.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/device.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/seq_file.h> #include <linux/serial.h> #include <linux/ratelimit.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/termios_internal.h> #include <linux/fs.h> #include <linux/kbd_kern.h> #include <linux/vt_kern.h> #include <linux/selection.h> #include <linux/kmod.h> #include <linux/nsproxy.h> #include "tty.h" #undef TTY_DEBUG_HANGUP #ifdef TTY_DEBUG_HANGUP # define tty_debug_hangup(tty, f, args...) tty_debug(tty, f, ##args) #else # define tty_debug_hangup(tty, f, args...) do { } while (0) #endif #define TTY_PARANOIA_CHECK 1 #define CHECK_TTY_COUNT 1 struct ktermios tty_std_termios = { /* for the benefit of tty drivers */ .c_iflag = ICRNL | IXON, .c_oflag = OPOST | ONLCR, .c_cflag = B38400 | CS8 | CREAD | HUPCL, .c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN, .c_cc = INIT_C_CC, .c_ispeed = 38400, .c_ospeed = 38400, /* .c_line = N_TTY, */ }; EXPORT_SYMBOL(tty_std_termios); /* This list gets poked at by procfs and various bits of boot up code. This * could do with some rationalisation such as pulling the tty proc function * into this file. */ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ /* Mutex to protect creating and releasing a tty */ DEFINE_MUTEX(tty_mutex); static ssize_t tty_read(struct kiocb *, struct iov_iter *); static ssize_t tty_write(struct kiocb *, struct iov_iter *); static __poll_t tty_poll(struct file *, poll_table *); static int tty_open(struct inode *, struct file *); #ifdef CONFIG_COMPAT static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define tty_compat_ioctl NULL #endif static int __tty_fasync(int fd, struct file *filp, int on); static int tty_fasync(int fd, struct file *filp, int on); static void release_tty(struct tty_struct *tty, int idx); /** * free_tty_struct - free a disused tty * @tty: tty struct to free * * Free the write buffers, tty queue and tty memory itself. * * Locking: none. Must be called after tty is definitely unused */ static void free_tty_struct(struct tty_struct *tty) { tty_ldisc_deinit(tty); put_device(tty->dev); kvfree(tty->write_buf); kfree(tty); } static inline struct tty_struct *file_tty(struct file *file) { return ((struct tty_file_private *)file->private_data)->tty; } int tty_alloc_file(struct file *file) { struct tty_file_private *priv; priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; file->private_data = priv; return 0; } /* Associate a new file with the tty structure */ void tty_add_file(struct tty_struct *tty, struct file *file) { struct tty_file_private *priv = file->private_data; priv->tty = tty; priv->file = file; spin_lock(&tty->files_lock); list_add(&priv->list, &tty->tty_files); spin_unlock(&tty->files_lock); } /** * tty_free_file - free file->private_data * @file: to free private_data of * * This shall be used only for fail path handling when tty_add_file was not * called yet. */ void tty_free_file(struct file *file) { struct tty_file_private *priv = file->private_data; file->private_data = NULL; kfree(priv); } /* Delete file from its tty */ static void tty_del_file(struct file *file) { struct tty_file_private *priv = file->private_data; struct tty_struct *tty = priv->tty; spin_lock(&tty->files_lock); list_del(&priv->list); spin_unlock(&tty->files_lock); tty_free_file(file); } /** * tty_name - return tty naming * @tty: tty structure * * Convert a tty structure into a name. The name reflects the kernel naming * policy and if udev is in use may not reflect user space * * Locking: none */ const char *tty_name(const struct tty_struct *tty) { if (!tty) /* Hmm. NULL pointer. That's fun. */ return "NULL tty"; return tty->name; } EXPORT_SYMBOL(tty_name); const char *tty_driver_name(const struct tty_struct *tty) { if (!tty || !tty->driver) return ""; return tty->driver->name; } static int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine) { #ifdef TTY_PARANOIA_CHECK if (!tty) { pr_warn("(%d:%d): %s: NULL tty\n", imajor(inode), iminor(inode), routine); return 1; } #endif return 0; } /* Caller must hold tty_lock */ static void check_tty_count(struct tty_struct *tty, const char *routine) { #ifdef CHECK_TTY_COUNT struct list_head *p; int count = 0, kopen_count = 0; spin_lock(&tty->files_lock); list_for_each(p, &tty->tty_files) { count++; } spin_unlock(&tty->files_lock); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) count++; if (tty_port_kopened(tty->port)) kopen_count++; if (tty->count != (count + kopen_count)) { tty_warn(tty, "%s: tty->count(%d) != (#fd's(%d) + #kopen's(%d))\n", routine, tty->count, count, kopen_count); } #endif } /** * get_tty_driver - find device of a tty * @device: device identifier * @index: returns the index of the tty * * This routine returns a tty driver structure, given a device number and also * passes back the index number. * * Locking: caller must hold tty_mutex */ static struct tty_driver *get_tty_driver(dev_t device, int *index) { struct tty_driver *p; list_for_each_entry(p, &tty_drivers, tty_drivers) { dev_t base = MKDEV(p->major, p->minor_start); if (device < base || device >= base + p->num) continue; *index = device - base; return tty_driver_kref_get(p); } return NULL; } /** * tty_dev_name_to_number - return dev_t for device name * @name: user space name of device under /dev * @number: pointer to dev_t that this function will populate * * This function converts device names like ttyS0 or ttyUSB1 into dev_t like * (4, 64) or (188, 1). If no corresponding driver is registered then the * function returns -%ENODEV. * * Locking: this acquires tty_mutex to protect the tty_drivers list from * being modified while we are traversing it, and makes sure to * release it before exiting. */ int tty_dev_name_to_number(const char *name, dev_t *number) { struct tty_driver *p; int ret; int index, prefix_length = 0; const char *str; for (str = name; *str && !isdigit(*str); str++) ; if (!*str) return -EINVAL; ret = kstrtoint(str, 10, &index); if (ret) return ret; prefix_length = str - name; mutex_lock(&tty_mutex); list_for_each_entry(p, &tty_drivers, tty_drivers) if (prefix_length == strlen(p->name) && strncmp(name, p->name, prefix_length) == 0) { if (index < p->num) { *number = MKDEV(p->major, p->minor_start + index); goto out; } } /* if here then driver wasn't found */ ret = -ENODEV; out: mutex_unlock(&tty_mutex); return ret; } EXPORT_SYMBOL_GPL(tty_dev_name_to_number); #ifdef CONFIG_CONSOLE_POLL /** * tty_find_polling_driver - find device of a polled tty * @name: name string to match * @line: pointer to resulting tty line nr * * This routine returns a tty driver structure, given a name and the condition * that the tty driver is capable of polled operation. */ struct tty_driver *tty_find_polling_driver(char *name, int *line) { struct tty_driver *p, *res = NULL; int tty_line = 0; int len; char *str, *stp; for (str = name; *str; str++) if ((*str >= '0' && *str <= '9') || *str == ',') break; if (!*str) return NULL; len = str - name; tty_line = simple_strtoul(str, &str, 10); mutex_lock(&tty_mutex); /* Search through the tty devices to look for a match */ list_for_each_entry(p, &tty_drivers, tty_drivers) { if (!len || strncmp(name, p->name, len) != 0) continue; stp = str; if (*stp == ',') stp++; if (*stp == '\0') stp = NULL; if (tty_line >= 0 && tty_line < p->num && p->ops && p->ops->poll_init && !p->ops->poll_init(p, tty_line, stp)) { res = tty_driver_kref_get(p); *line = tty_line; break; } } mutex_unlock(&tty_mutex); return res; } EXPORT_SYMBOL_GPL(tty_find_polling_driver); #endif static ssize_t hung_up_tty_read(struct kiocb *iocb, struct iov_iter *to) { return 0; } static ssize_t hung_up_tty_write(struct kiocb *iocb, struct iov_iter *from) { return -EIO; } /* No kernel lock held - none needed ;) */ static __poll_t hung_up_tty_poll(struct file *filp, poll_table *wait) { return EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM | EPOLLWRNORM; } static long hung_up_tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? -ENOTTY : -EIO; } static long hung_up_tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? -ENOTTY : -EIO; } static int hung_up_tty_fasync(int fd, struct file *file, int on) { return -ENOTTY; } static void tty_show_fdinfo(struct seq_file *m, struct file *file) { struct tty_struct *tty = file_tty(file); if (tty && tty->ops && tty->ops->show_fdinfo) tty->ops->show_fdinfo(tty, m); } static const struct file_operations tty_fops = { .llseek = no_llseek, .read_iter = tty_read, .write_iter = tty_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, .show_fdinfo = tty_show_fdinfo, }; static const struct file_operations console_fops = { .llseek = no_llseek, .read_iter = tty_read, .write_iter = redirected_tty_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, }; static const struct file_operations hung_up_tty_fops = { .llseek = no_llseek, .read_iter = hung_up_tty_read, .write_iter = hung_up_tty_write, .poll = hung_up_tty_poll, .unlocked_ioctl = hung_up_tty_ioctl, .compat_ioctl = hung_up_tty_compat_ioctl, .release = tty_release, .fasync = hung_up_tty_fasync, }; static DEFINE_SPINLOCK(redirect_lock); static struct file *redirect; /** * tty_wakeup - request more data * @tty: terminal * * Internal and external helper for wakeups of tty. This function informs the * line discipline if present that the driver is ready to receive more output * data. */ void tty_wakeup(struct tty_struct *tty) { struct tty_ldisc *ld; if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) { ld = tty_ldisc_ref(tty); if (ld) { if (ld->ops->write_wakeup) ld->ops->write_wakeup(tty); tty_ldisc_deref(ld); } } wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); } EXPORT_SYMBOL_GPL(tty_wakeup); /** * tty_release_redirect - Release a redirect on a pty if present * @tty: tty device * * This is available to the pty code so if the master closes, if the slave is a * redirect it can release the redirect. */ static struct file *tty_release_redirect(struct tty_struct *tty) { struct file *f = NULL; spin_lock(&redirect_lock); if (redirect && file_tty(redirect) == tty) { f = redirect; redirect = NULL; } spin_unlock(&redirect_lock); return f; } /** * __tty_hangup - actual handler for hangup events * @tty: tty device * @exit_session: if non-zero, signal all foreground group processes * * This can be called by a "kworker" kernel thread. That is process synchronous * but doesn't hold any locks, so we need to make sure we have the appropriate * locks for what we're doing. * * The hangup event clears any pending redirections onto the hung up device. It * ensures future writes will error and it does the needed line discipline * hangup and signal delivery. The tty object itself remains intact. * * Locking: * * BTM * * * redirect lock for undoing redirection * * file list lock for manipulating list of ttys * * tty_ldiscs_lock from called functions * * termios_rwsem resetting termios data * * tasklist_lock to walk task list for hangup event * * * ->siglock to protect ->signal/->sighand * */ static void __tty_hangup(struct tty_struct *tty, int exit_session) { struct file *cons_filp = NULL; struct file *filp, *f; struct tty_file_private *priv; int closecount = 0, n; int refs; if (!tty) return; f = tty_release_redirect(tty); tty_lock(tty); if (test_bit(TTY_HUPPED, &tty->flags)) { tty_unlock(tty); return; } /* * Some console devices aren't actually hung up for technical and * historical reasons, which can lead to indefinite interruptible * sleep in n_tty_read(). The following explicitly tells * n_tty_read() to abort readers. */ set_bit(TTY_HUPPING, &tty->flags); /* inuse_filps is protected by the single tty lock, * this really needs to change if we want to flush the * workqueue with the lock held. */ check_tty_count(tty, "tty_hangup"); spin_lock(&tty->files_lock); /* This breaks for file handles being sent over AF_UNIX sockets ? */ list_for_each_entry(priv, &tty->tty_files, list) { filp = priv->file; if (filp->f_op->write_iter == redirected_tty_write) cons_filp = filp; if (filp->f_op->write_iter != tty_write) continue; closecount++; __tty_fasync(-1, filp, 0); /* can't block */ filp->f_op = &hung_up_tty_fops; } spin_unlock(&tty->files_lock); refs = tty_signal_session_leader(tty, exit_session); /* Account for the p->signal references we killed */ while (refs--) tty_kref_put(tty); tty_ldisc_hangup(tty, cons_filp != NULL); spin_lock_irq(&tty->ctrl.lock); clear_bit(TTY_THROTTLED, &tty->flags); clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); put_pid(tty->ctrl.session); put_pid(tty->ctrl.pgrp); tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; tty->ctrl.pktstatus = 0; spin_unlock_irq(&tty->ctrl.lock); /* * If one of the devices matches a console pointer, we * cannot just call hangup() because that will cause * tty->count and state->count to go out of sync. * So we just call close() the right number of times. */ if (cons_filp) { if (tty->ops->close) for (n = 0; n < closecount; n++) tty->ops->close(tty, cons_filp); } else if (tty->ops->hangup) tty->ops->hangup(tty); /* * We don't want to have driver/ldisc interactions beyond the ones * we did here. The driver layer expects no calls after ->hangup() * from the ldisc side, which is now guaranteed. */ set_bit(TTY_HUPPED, &tty->flags); clear_bit(TTY_HUPPING, &tty->flags); tty_unlock(tty); if (f) fput(f); } static void do_tty_hangup(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); __tty_hangup(tty, 0); } /** * tty_hangup - trigger a hangup event * @tty: tty to hangup * * A carrier loss (virtual or otherwise) has occurred on @tty. Schedule a * hangup sequence to run after this event. */ void tty_hangup(struct tty_struct *tty) { tty_debug_hangup(tty, "hangup\n"); schedule_work(&tty->hangup_work); } EXPORT_SYMBOL(tty_hangup); /** * tty_vhangup - process vhangup * @tty: tty to hangup * * The user has asked via system call for the terminal to be hung up. We do * this synchronously so that when the syscall returns the process is complete. * That guarantee is necessary for security reasons. */ void tty_vhangup(struct tty_struct *tty) { tty_debug_hangup(tty, "vhangup\n"); __tty_hangup(tty, 0); } EXPORT_SYMBOL(tty_vhangup); /** * tty_vhangup_self - process vhangup for own ctty * * Perform a vhangup on the current controlling tty */ void tty_vhangup_self(void) { struct tty_struct *tty; tty = get_current_tty(); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } } /** * tty_vhangup_session - hangup session leader exit * @tty: tty to hangup * * The session leader is exiting and hanging up its controlling terminal. * Every process in the foreground process group is signalled %SIGHUP. * * We do this synchronously so that when the syscall returns the process is * complete. That guarantee is necessary for security reasons. */ void tty_vhangup_session(struct tty_struct *tty) { tty_debug_hangup(tty, "session hangup\n"); __tty_hangup(tty, 1); } /** * tty_hung_up_p - was tty hung up * @filp: file pointer of tty * * Return: true if the tty has been subject to a vhangup or a carrier loss */ int tty_hung_up_p(struct file *filp) { return (filp && filp->f_op == &hung_up_tty_fops); } EXPORT_SYMBOL(tty_hung_up_p); void __stop_tty(struct tty_struct *tty) { if (tty->flow.stopped) return; tty->flow.stopped = true; if (tty->ops->stop) tty->ops->stop(tty); } /** * stop_tty - propagate flow control * @tty: tty to stop * * Perform flow control to the driver. May be called on an already stopped * device and will not re-call the &tty_driver->stop() method. * * This functionality is used by both the line disciplines for halting incoming * flow and by the driver. It may therefore be called from any context, may be * under the tty %atomic_write_lock but not always. * * Locking: * flow.lock */ void stop_tty(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->flow.lock, flags); __stop_tty(tty); spin_unlock_irqrestore(&tty->flow.lock, flags); } EXPORT_SYMBOL(stop_tty); void __start_tty(struct tty_struct *tty) { if (!tty->flow.stopped || tty->flow.tco_stopped) return; tty->flow.stopped = false; if (tty->ops->start) tty->ops->start(tty); tty_wakeup(tty); } /** * start_tty - propagate flow control * @tty: tty to start * * Start a tty that has been stopped if at all possible. If @tty was previously * stopped and is now being started, the &tty_driver->start() method is invoked * and the line discipline woken. * * Locking: * flow.lock */ void start_tty(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->flow.lock, flags); __start_tty(tty); spin_unlock_irqrestore(&tty->flow.lock, flags); } EXPORT_SYMBOL(start_tty); static void tty_update_time(struct tty_struct *tty, bool mtime) { time64_t sec = ktime_get_real_seconds(); struct tty_file_private *priv; spin_lock(&tty->files_lock); list_for_each_entry(priv, &tty->tty_files, list) { struct inode *inode = file_inode(priv->file); struct timespec64 time = mtime ? inode_get_mtime(inode) : inode_get_atime(inode); /* * We only care if the two values differ in anything other than the * lower three bits (i.e every 8 seconds). If so, then we can update * the time of the tty device, otherwise it could be construded as a * security leak to let userspace know the exact timing of the tty. */ if ((sec ^ time.tv_sec) & ~7) { if (mtime) inode_set_mtime(inode, sec, 0); else inode_set_atime(inode, sec, 0); } } spin_unlock(&tty->files_lock); } /* * Iterate on the ldisc ->read() function until we've gotten all * the data the ldisc has for us. * * The "cookie" is something that the ldisc read function can fill * in to let us know that there is more data to be had. * * We promise to continue to call the ldisc until it stops returning * data or clears the cookie. The cookie may be something that the * ldisc maintains state for and needs to free. */ static ssize_t iterate_tty_read(struct tty_ldisc *ld, struct tty_struct *tty, struct file *file, struct iov_iter *to) { void *cookie = NULL; unsigned long offset = 0; char kernel_buf[64]; ssize_t retval = 0; size_t copied, count = iov_iter_count(to); do { ssize_t size = min(count, sizeof(kernel_buf)); size = ld->ops->read(tty, file, kernel_buf, size, &cookie, offset); if (!size) break; if (size < 0) { /* Did we have an earlier error (ie -EFAULT)? */ if (retval) break; retval = size; /* * -EOVERFLOW means we didn't have enough space * for a whole packet, and we shouldn't return * a partial result. */ if (retval == -EOVERFLOW) offset = 0; break; } copied = copy_to_iter(kernel_buf, size, to); offset += copied; count -= copied; /* * If the user copy failed, we still need to do another ->read() * call if we had a cookie to let the ldisc clear up. * * But make sure size is zeroed. */ if (unlikely(copied != size)) { count = 0; retval = -EFAULT; } } while (cookie); /* We always clear tty buffer in case they contained passwords */ memzero_explicit(kernel_buf, sizeof(kernel_buf)); return offset ? offset : retval; } /** * tty_read - read method for tty device files * @iocb: kernel I/O control block * @to: destination for the data read * * Perform the read system call function on this terminal device. Checks * for hung up devices before calling the line discipline method. * * Locking: * Locks the line discipline internally while needed. Multiple read calls * may be outstanding in parallel. */ static ssize_t tty_read(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; ssize_t ret; if (tty_paranoia_check(tty, inode, "tty_read")) return -EIO; if (!tty || tty_io_error(tty)) return -EIO; /* We want to wait for the line discipline to sort out in this * situation. */ ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_read(iocb, to); ret = -EIO; if (ld->ops->read) ret = iterate_tty_read(ld, tty, file, to); tty_ldisc_deref(ld); if (ret > 0) tty_update_time(tty, false); return ret; } void tty_write_unlock(struct tty_struct *tty) { mutex_unlock(&tty->atomic_write_lock); wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); } int tty_write_lock(struct tty_struct *tty, bool ndelay) { if (!mutex_trylock(&tty->atomic_write_lock)) { if (ndelay) return -EAGAIN; if (mutex_lock_interruptible(&tty->atomic_write_lock)) return -ERESTARTSYS; } return 0; } /* * Split writes up in sane blocksizes to avoid * denial-of-service type attacks */ static ssize_t iterate_tty_write(struct tty_ldisc *ld, struct tty_struct *tty, struct file *file, struct iov_iter *from) { size_t chunk, count = iov_iter_count(from); ssize_t ret, written = 0; ret = tty_write_lock(tty, file->f_flags & O_NDELAY); if (ret < 0) return ret; /* * We chunk up writes into a temporary buffer. This * simplifies low-level drivers immensely, since they * don't have locking issues and user mode accesses. * * But if TTY_NO_WRITE_SPLIT is set, we should use a * big chunk-size.. * * The default chunk-size is 2kB, because the NTTY * layer has problems with bigger chunks. It will * claim to be able to handle more characters than * it actually does. */ chunk = 2048; if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags)) chunk = 65536; if (count < chunk) chunk = count; /* write_buf/write_cnt is protected by the atomic_write_lock mutex */ if (tty->write_cnt < chunk) { unsigned char *buf_chunk; if (chunk < 1024) chunk = 1024; buf_chunk = kvmalloc(chunk, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!buf_chunk) { ret = -ENOMEM; goto out; } kvfree(tty->write_buf); tty->write_cnt = chunk; tty->write_buf = buf_chunk; } /* Do the write .. */ for (;;) { size_t size = min(chunk, count); ret = -EFAULT; if (copy_from_iter(tty->write_buf, size, from) != size) break; ret = ld->ops->write(tty, file, tty->write_buf, size); if (ret <= 0) break; written += ret; if (ret > size) break; /* FIXME! Have Al check this! */ if (ret != size) iov_iter_revert(from, size-ret); count -= ret; if (!count) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; cond_resched(); } if (written) { tty_update_time(tty, true); ret = written; } out: tty_write_unlock(tty); return ret; } /** * tty_write_message - write a message to a certain tty, not just the console. * @tty: the destination tty_struct * @msg: the message to write * * This is used for messages that need to be redirected to a specific tty. We * don't put it into the syslog queue right now maybe in the future if really * needed. * * We must still hold the BTM and test the CLOSING flag for the moment. */ void tty_write_message(struct tty_struct *tty, char *msg) { if (tty) { mutex_lock(&tty->atomic_write_lock); tty_lock(tty); if (tty->ops->write && tty->count > 0) tty->ops->write(tty, msg, strlen(msg)); tty_unlock(tty); tty_write_unlock(tty); } } static ssize_t file_tty_write(struct file *file, struct kiocb *iocb, struct iov_iter *from) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; ssize_t ret; if (tty_paranoia_check(tty, file_inode(file), "tty_write")) return -EIO; if (!tty || !tty->ops->write || tty_io_error(tty)) return -EIO; /* Short term debug to catch buggy drivers */ if (tty->ops->write_room == NULL) tty_err(tty, "missing write_room method\n"); ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_write(iocb, from); if (!ld->ops->write) ret = -EIO; else ret = iterate_tty_write(ld, tty, file, from); tty_ldisc_deref(ld); return ret; } /** * tty_write - write method for tty device file * @iocb: kernel I/O control block * @from: iov_iter with data to write * * Write data to a tty device via the line discipline. * * Locking: * Locks the line discipline as required * Writes to the tty driver are serialized by the atomic_write_lock * and are then processed in chunks to the device. The line * discipline write method will not be invoked in parallel for * each device. */ static ssize_t tty_write(struct kiocb *iocb, struct iov_iter *from) { return file_tty_write(iocb->ki_filp, iocb, from); } ssize_t redirected_tty_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *p = NULL; spin_lock(&redirect_lock); if (redirect) p = get_file(redirect); spin_unlock(&redirect_lock); /* * We know the redirected tty is just another tty, we can * call file_tty_write() directly with that file pointer. */ if (p) { ssize_t res; res = file_tty_write(p, iocb, iter); fput(p); return res; } return tty_write(iocb, iter); } /** * tty_send_xchar - send priority character * @tty: the tty to send to * @ch: xchar to send * * Send a high priority character to the tty even if stopped. * * Locking: none for xchar method, write ordering for write method. */ int tty_send_xchar(struct tty_struct *tty, char ch) { bool was_stopped = tty->flow.stopped; if (tty->ops->send_xchar) { down_read(&tty->termios_rwsem); tty->ops->send_xchar(tty, ch); up_read(&tty->termios_rwsem); return 0; } if (tty_write_lock(tty, false) < 0) return -ERESTARTSYS; down_read(&tty->termios_rwsem); if (was_stopped) start_tty(tty); tty->ops->write(tty, &ch, 1); if (was_stopped) stop_tty(tty); up_read(&tty->termios_rwsem); tty_write_unlock(tty); return 0; } /** * pty_line_name - generate name for a pty * @driver: the tty driver in use * @index: the minor number * @p: output buffer of at least 6 bytes * * Generate a name from a @driver reference and write it to the output buffer * @p. * * Locking: None */ static void pty_line_name(struct tty_driver *driver, int index, char *p) { static const char ptychar[] = "pqrstuvwxyzabcde"; int i = index + driver->name_base; /* ->name is initialized to "ttyp", but "tty" is expected */ sprintf(p, "%s%c%x", driver->subtype == PTY_TYPE_SLAVE ? "tty" : driver->name, ptychar[i >> 4 & 0xf], i & 0xf); } /** * tty_line_name - generate name for a tty * @driver: the tty driver in use * @index: the minor number * @p: output buffer of at least 7 bytes * * Generate a name from a @driver reference and write it to the output buffer * @p. * * Locking: None */ static ssize_t tty_line_name(struct tty_driver *driver, int index, char *p) { if (driver->flags & TTY_DRIVER_UNNUMBERED_NODE) return sprintf(p, "%s", driver->name); else return sprintf(p, "%s%d", driver->name, index + driver->name_base); } /** * tty_driver_lookup_tty() - find an existing tty, if any * @driver: the driver for the tty * @file: file object * @idx: the minor number * * Return: the tty, if found. If not found, return %NULL or ERR_PTR() if the * driver lookup() method returns an error. * * Locking: tty_mutex must be held. If the tty is found, bump the tty kref. */ static struct tty_struct *tty_driver_lookup_tty(struct tty_driver *driver, struct file *file, int idx) { struct tty_struct *tty; if (driver->ops->lookup) { if (!file) tty = ERR_PTR(-EIO); else tty = driver->ops->lookup(driver, file, idx); } else { if (idx >= driver->num) return ERR_PTR(-EINVAL); tty = driver->ttys[idx]; } if (!IS_ERR(tty)) tty_kref_get(tty); return tty; } /** * tty_init_termios - helper for termios setup * @tty: the tty to set up * * Initialise the termios structure for this tty. This runs under the * %tty_mutex currently so we can be relaxed about ordering. */ void tty_init_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) tty->termios = tty->driver->init_termios; else { /* Check for lazy saved data */ tp = tty->driver->termios[idx]; if (tp != NULL) { tty->termios = *tp; tty->termios.c_line = tty->driver->init_termios.c_line; } else tty->termios = tty->driver->init_termios; } /* Compatibility until drivers always set this */ tty->termios.c_ispeed = tty_termios_input_baud_rate(&tty->termios); tty->termios.c_ospeed = tty_termios_baud_rate(&tty->termios); } EXPORT_SYMBOL_GPL(tty_init_termios); /** * tty_standard_install - usual tty->ops->install * @driver: the driver for the tty * @tty: the tty * * If the @driver overrides @tty->ops->install, it still can call this function * to perform the standard install operations. */ int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty) { tty_init_termios(tty); tty_driver_kref_get(driver); tty->count++; driver->ttys[tty->index] = tty; return 0; } EXPORT_SYMBOL_GPL(tty_standard_install); /** * tty_driver_install_tty() - install a tty entry in the driver * @driver: the driver for the tty * @tty: the tty * * Install a tty object into the driver tables. The @tty->index field will be * set by the time this is called. This method is responsible for ensuring any * need additional structures are allocated and configured. * * Locking: tty_mutex for now */ static int tty_driver_install_tty(struct tty_driver *driver, struct tty_struct *tty) { return driver->ops->install ? driver->ops->install(driver, tty) : tty_standard_install(driver, tty); } /** * tty_driver_remove_tty() - remove a tty from the driver tables * @driver: the driver for the tty * @tty: tty to remove * * Remove a tty object from the driver tables. The tty->index field will be set * by the time this is called. * * Locking: tty_mutex for now */ static void tty_driver_remove_tty(struct tty_driver *driver, struct tty_struct *tty) { if (driver->ops->remove) driver->ops->remove(driver, tty); else driver->ttys[tty->index] = NULL; } /** * tty_reopen() - fast re-open of an open tty * @tty: the tty to open * * Re-opens on master ptys are not allowed and return -%EIO. * * Locking: Caller must hold tty_lock * Return: 0 on success, -errno on error. */ static int tty_reopen(struct tty_struct *tty) { struct tty_driver *driver = tty->driver; struct tty_ldisc *ld; int retval = 0; if (driver->type == TTY_DRIVER_TYPE_PTY && driver->subtype == PTY_TYPE_MASTER) return -EIO; if (!tty->count) return -EAGAIN; if (test_bit(TTY_EXCLUSIVE, &tty->flags) && !capable(CAP_SYS_ADMIN)) return -EBUSY; ld = tty_ldisc_ref_wait(tty); if (ld) { tty_ldisc_deref(ld); } else { retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) return retval; if (!tty->ldisc) retval = tty_ldisc_reinit(tty, tty->termios.c_line); tty_ldisc_unlock(tty); } if (retval == 0) tty->count++; return retval; } /** * tty_init_dev - initialise a tty device * @driver: tty driver we are opening a device on * @idx: device index * * Prepare a tty device. This may not be a "new" clean device but could also be * an active device. The pty drivers require special handling because of this. * * Locking: * The function is called under the tty_mutex, which protects us from the * tty struct or driver itself going away. * * On exit the tty device has the line discipline attached and a reference * count of 1. If a pair was created for pty/tty use and the other was a pty * master then it too has a reference count of 1. * * WSH 06/09/97: Rewritten to remove races and properly clean up after a failed * open. The new code protects the open with a mutex, so it's really quite * straightforward. The mutex locking can probably be relaxed for the (most * common) case of reopening a tty. * * Return: new tty structure */ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) { struct tty_struct *tty; int retval; /* * First time open is complex, especially for PTY devices. * This code guarantees that either everything succeeds and the * TTY is ready for operation, or else the table slots are vacated * and the allocated memory released. (Except that the termios * may be retained.) */ if (!try_module_get(driver->owner)) return ERR_PTR(-ENODEV); tty = alloc_tty_struct(driver, idx); if (!tty) { retval = -ENOMEM; goto err_module_put; } tty_lock(tty); retval = tty_driver_install_tty(driver, tty); if (retval < 0) goto err_free_tty; if (!tty->port) tty->port = driver->ports[idx]; if (WARN_RATELIMIT(!tty->port, "%s: %s driver does not set tty->port. This would crash the kernel. Fix the driver!\n", __func__, tty->driver->name)) { retval = -EINVAL; goto err_release_lock; } retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) goto err_release_lock; tty->port->itty = tty; /* * Structures all installed ... call the ldisc open routines. * If we fail here just call release_tty to clean up. No need * to decrement the use counts, as release_tty doesn't care. */ retval = tty_ldisc_setup(tty, tty->link); if (retval) goto err_release_tty; tty_ldisc_unlock(tty); /* Return the tty locked so that it cannot vanish under the caller */ return tty; err_free_tty: tty_unlock(tty); free_tty_struct(tty); err_module_put: module_put(driver->owner); return ERR_PTR(retval); /* call the tty release_tty routine to clean out this slot */ err_release_tty: tty_ldisc_unlock(tty); tty_info_ratelimited(tty, "ldisc open failed (%d), clearing slot %d\n", retval, idx); err_release_lock: tty_unlock(tty); release_tty(tty, idx); return ERR_PTR(retval); } /** * tty_save_termios() - save tty termios data in driver table * @tty: tty whose termios data to save * * Locking: Caller guarantees serialisation with tty_init_termios(). */ void tty_save_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; /* If the port is going to reset then it has no termios to save */ if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) return; /* Stash the termios data */ tp = tty->driver->termios[idx]; if (tp == NULL) { tp = kmalloc(sizeof(*tp), GFP_KERNEL); if (tp == NULL) return; tty->driver->termios[idx] = tp; } *tp = tty->termios; } EXPORT_SYMBOL_GPL(tty_save_termios); /** * tty_flush_works - flush all works of a tty/pty pair * @tty: tty device to flush works for (or either end of a pty pair) * * Sync flush all works belonging to @tty (and the 'other' tty). */ static void tty_flush_works(struct tty_struct *tty) { flush_work(&tty->SAK_work); flush_work(&tty->hangup_work); if (tty->link) { flush_work(&tty->link->SAK_work); flush_work(&tty->link->hangup_work); } } /** * release_one_tty - release tty structure memory * @work: work of tty we are obliterating * * Releases memory associated with a tty structure, and clears out the * driver table slots. This function is called when a device is no longer * in use. It also gets called when setup of a device fails. * * Locking: * takes the file list lock internally when working on the list of ttys * that the driver keeps. * * This method gets called from a work queue so that the driver private * cleanup ops can sleep (needed for USB at least) */ static void release_one_tty(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); struct tty_driver *driver = tty->driver; struct module *owner = driver->owner; if (tty->ops->cleanup) tty->ops->cleanup(tty); tty_driver_kref_put(driver); module_put(owner); spin_lock(&tty->files_lock); list_del_init(&tty->tty_files); spin_unlock(&tty->files_lock); put_pid(tty->ctrl.pgrp); put_pid(tty->ctrl.session); free_tty_struct(tty); } static void queue_release_one_tty(struct kref *kref) { struct tty_struct *tty = container_of(kref, struct tty_struct, kref); /* The hangup queue is now free so we can reuse it rather than * waste a chunk of memory for each port. */ INIT_WORK(&tty->hangup_work, release_one_tty); schedule_work(&tty->hangup_work); } /** * tty_kref_put - release a tty kref * @tty: tty device * * Release a reference to the @tty device and if need be let the kref layer * destruct the object for us. */ void tty_kref_put(struct tty_struct *tty) { if (tty) kref_put(&tty->kref, queue_release_one_tty); } EXPORT_SYMBOL(tty_kref_put); /** * release_tty - release tty structure memory * @tty: tty device release * @idx: index of the tty device release * * Release both @tty and a possible linked partner (think pty pair), * and decrement the refcount of the backing module. * * Locking: * tty_mutex * takes the file list lock internally when working on the list of ttys * that the driver keeps. */ static void release_tty(struct tty_struct *tty, int idx) { /* This should always be true but check for the moment */ WARN_ON(tty->index != idx); WARN_ON(!mutex_is_locked(&tty_mutex)); if (tty->ops->shutdown) tty->ops->shutdown(tty); tty_save_termios(tty); tty_driver_remove_tty(tty->driver, tty); if (tty->port) tty->port->itty = NULL; if (tty->link) tty->link->port->itty = NULL; if (tty->port) tty_buffer_cancel_work(tty->port); if (tty->link) tty_buffer_cancel_work(tty->link->port); tty_kref_put(tty->link); tty_kref_put(tty); } /** * tty_release_checks - check a tty before real release * @tty: tty to check * @idx: index of the tty * * Performs some paranoid checking before true release of the @tty. This is a * no-op unless %TTY_PARANOIA_CHECK is defined. */ static int tty_release_checks(struct tty_struct *tty, int idx) { #ifdef TTY_PARANOIA_CHECK if (idx < 0 || idx >= tty->driver->num) { tty_debug(tty, "bad idx %d\n", idx); return -1; } /* not much to check for devpts */ if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) return 0; if (tty != tty->driver->ttys[idx]) { tty_debug(tty, "bad driver table[%d] = %p\n", idx, tty->driver->ttys[idx]); return -1; } if (tty->driver->other) { struct tty_struct *o_tty = tty->link; if (o_tty != tty->driver->other->ttys[idx]) { tty_debug(tty, "bad other table[%d] = %p\n", idx, tty->driver->other->ttys[idx]); return -1; } if (o_tty->link != tty) { tty_debug(tty, "bad link = %p\n", o_tty->link); return -1; } } #endif return 0; } /** * tty_kclose - closes tty opened by tty_kopen * @tty: tty device * * Performs the final steps to release and free a tty device. It is the same as * tty_release_struct() except that it also resets %TTY_PORT_KOPENED flag on * @tty->port. */ void tty_kclose(struct tty_struct *tty) { /* * Ask the line discipline code to release its structures */ tty_ldisc_release(tty); /* Wait for pending work before tty destruction commences */ tty_flush_works(tty); tty_debug_hangup(tty, "freeing structure\n"); /* * The release_tty function takes care of the details of clearing * the slots and preserving the termios structure. */ mutex_lock(&tty_mutex); tty_port_set_kopened(tty->port, 0); release_tty(tty, tty->index); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL_GPL(tty_kclose); /** * tty_release_struct - release a tty struct * @tty: tty device * @idx: index of the tty * * Performs the final steps to release and free a tty device. It is roughly the * reverse of tty_init_dev(). */ void tty_release_struct(struct tty_struct *tty, int idx) { /* * Ask the line discipline code to release its structures */ tty_ldisc_release(tty); /* Wait for pending work before tty destruction commmences */ tty_flush_works(tty); tty_debug_hangup(tty, "freeing structure\n"); /* * The release_tty function takes care of the details of clearing * the slots and preserving the termios structure. */ mutex_lock(&tty_mutex); release_tty(tty, idx); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL_GPL(tty_release_struct); /** * tty_release - vfs callback for close * @inode: inode of tty * @filp: file pointer for handle to tty * * Called the last time each file handle is closed that references this tty. * There may however be several such references. * * Locking: * Takes BKL. See tty_release_dev(). * * Even releasing the tty structures is a tricky business. We have to be very * careful that the structures are all released at the same time, as interrupts * might otherwise get the wrong pointers. * * WSH 09/09/97: rewritten to avoid some nasty race conditions that could * lead to double frees or releasing memory still in use. */ int tty_release(struct inode *inode, struct file *filp) { struct tty_struct *tty = file_tty(filp); struct tty_struct *o_tty = NULL; int do_sleep, final; int idx; long timeout = 0; int once = 1; if (tty_paranoia_check(tty, inode, __func__)) return 0; tty_lock(tty); check_tty_count(tty, __func__); __tty_fasync(-1, filp, 0); idx = tty->index; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) o_tty = tty->link; if (tty_release_checks(tty, idx)) { tty_unlock(tty); return 0; } tty_debug_hangup(tty, "releasing (count=%d)\n", tty->count); if (tty->ops->close) tty->ops->close(tty, filp); /* If tty is pty master, lock the slave pty (stable lock order) */ tty_lock_slave(o_tty); /* * Sanity check: if tty->count is going to zero, there shouldn't be * any waiters on tty->read_wait or tty->write_wait. We test the * wait queues and kick everyone out _before_ actually starting to * close. This ensures that we won't block while releasing the tty * structure. * * The test for the o_tty closing is necessary, since the master and * slave sides may close in any order. If the slave side closes out * first, its count will be one, since the master side holds an open. * Thus this test wouldn't be triggered at the time the slave closed, * so we do it now. */ while (1) { do_sleep = 0; if (tty->count <= 1) { if (waitqueue_active(&tty->read_wait)) { wake_up_poll(&tty->read_wait, EPOLLIN); do_sleep++; } if (waitqueue_active(&tty->write_wait)) { wake_up_poll(&tty->write_wait, EPOLLOUT); do_sleep++; } } if (o_tty && o_tty->count <= 1) { if (waitqueue_active(&o_tty->read_wait)) { wake_up_poll(&o_tty->read_wait, EPOLLIN); do_sleep++; } if (waitqueue_active(&o_tty->write_wait)) { wake_up_poll(&o_tty->write_wait, EPOLLOUT); do_sleep++; } } if (!do_sleep) break; if (once) { once = 0; tty_warn(tty, "read/write wait queue active!\n"); } schedule_timeout_killable(timeout); if (timeout < 120 * HZ) timeout = 2 * timeout + 1; else timeout = MAX_SCHEDULE_TIMEOUT; } if (o_tty) { if (--o_tty->count < 0) { tty_warn(tty, "bad slave count (%d)\n", o_tty->count); o_tty->count = 0; } } if (--tty->count < 0) { tty_warn(tty, "bad tty->count (%d)\n", tty->count); tty->count = 0; } /* * We've decremented tty->count, so we need to remove this file * descriptor off the tty->tty_files list; this serves two * purposes: * - check_tty_count sees the correct number of file descriptors * associated with this tty. * - do_tty_hangup no longer sees this file descriptor as * something that needs to be handled for hangups. */ tty_del_file(filp); /* * Perform some housekeeping before deciding whether to return. * * If _either_ side is closing, make sure there aren't any * processes that still think tty or o_tty is their controlling * tty. */ if (!tty->count) { read_lock(&tasklist_lock); session_clear_tty(tty->ctrl.session); if (o_tty) session_clear_tty(o_tty->ctrl.session); read_unlock(&tasklist_lock); } /* check whether both sides are closing ... */ final = !tty->count && !(o_tty && o_tty->count); tty_unlock_slave(o_tty); tty_unlock(tty); /* At this point, the tty->count == 0 should ensure a dead tty * cannot be re-opened by a racing opener. */ if (!final) return 0; tty_debug_hangup(tty, "final close\n"); tty_release_struct(tty, idx); return 0; } /** * tty_open_current_tty - get locked tty of current task * @device: device number * @filp: file pointer to tty * @return: locked tty of the current task iff @device is /dev/tty * * Performs a re-open of the current task's controlling tty. * * We cannot return driver and index like for the other nodes because devpts * will not work then. It expects inodes to be from devpts FS. */ static struct tty_struct *tty_open_current_tty(dev_t device, struct file *filp) { struct tty_struct *tty; int retval; if (device != MKDEV(TTYAUX_MAJOR, 0)) return NULL; tty = get_current_tty(); if (!tty) return ERR_PTR(-ENXIO); filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ tty_lock(tty); tty_kref_put(tty); /* safe to drop the kref now */ retval = tty_reopen(tty); if (retval < 0) { tty_unlock(tty); tty = ERR_PTR(retval); } return tty; } /** * tty_lookup_driver - lookup a tty driver for a given device file * @device: device number * @filp: file pointer to tty * @index: index for the device in the @return driver * * If returned value is not erroneous, the caller is responsible to decrement * the refcount by tty_driver_kref_put(). * * Locking: %tty_mutex protects get_tty_driver() * * Return: driver for this inode (with increased refcount) */ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp, int *index) { struct tty_driver *driver = NULL; switch (device) { #ifdef CONFIG_VT case MKDEV(TTY_MAJOR, 0): { extern struct tty_driver *console_driver; driver = tty_driver_kref_get(console_driver); *index = fg_console; break; } #endif case MKDEV(TTYAUX_MAJOR, 1): { struct tty_driver *console_driver = console_device(index); if (console_driver) { driver = tty_driver_kref_get(console_driver); if (driver && filp) { /* Don't let /dev/console block */ filp->f_flags |= O_NONBLOCK; break; } } if (driver) tty_driver_kref_put(driver); return ERR_PTR(-ENODEV); } default: driver = get_tty_driver(device, index); if (!driver) return ERR_PTR(-ENODEV); break; } return driver; } static struct tty_struct *tty_kopen(dev_t device, int shared) { struct tty_struct *tty; struct tty_driver *driver; int index = -1; mutex_lock(&tty_mutex); driver = tty_lookup_driver(device, NULL, &index); if (IS_ERR(driver)) { mutex_unlock(&tty_mutex); return ERR_CAST(driver); } /* check whether we're reopening an existing tty */ tty = tty_driver_lookup_tty(driver, NULL, index); if (IS_ERR(tty) || shared) goto out; if (tty) { /* drop kref from tty_driver_lookup_tty() */ tty_kref_put(tty); tty = ERR_PTR(-EBUSY); } else { /* tty_init_dev returns tty with the tty_lock held */ tty = tty_init_dev(driver, index); if (IS_ERR(tty)) goto out; tty_port_set_kopened(tty->port, 1); } out: mutex_unlock(&tty_mutex); tty_driver_kref_put(driver); return tty; } /** * tty_kopen_exclusive - open a tty device for kernel * @device: dev_t of device to open * * Opens tty exclusively for kernel. Performs the driver lookup, makes sure * it's not already opened and performs the first-time tty initialization. * * Claims the global %tty_mutex to serialize: * * concurrent first-time tty initialization * * concurrent tty driver removal w/ lookup * * concurrent tty removal from driver table * * Return: the locked initialized &tty_struct */ struct tty_struct *tty_kopen_exclusive(dev_t device) { return tty_kopen(device, 0); } EXPORT_SYMBOL_GPL(tty_kopen_exclusive); /** * tty_kopen_shared - open a tty device for shared in-kernel use * @device: dev_t of device to open * * Opens an already existing tty for in-kernel use. Compared to * tty_kopen_exclusive() above it doesn't ensure to be the only user. * * Locking: identical to tty_kopen() above. */ struct tty_struct *tty_kopen_shared(dev_t device) { return tty_kopen(device, 1); } EXPORT_SYMBOL_GPL(tty_kopen_shared); /** * tty_open_by_driver - open a tty device * @device: dev_t of device to open * @filp: file pointer to tty * * Performs the driver lookup, checks for a reopen, or otherwise performs the * first-time tty initialization. * * * Claims the global tty_mutex to serialize: * * concurrent first-time tty initialization * * concurrent tty driver removal w/ lookup * * concurrent tty removal from driver table * * Return: the locked initialized or re-opened &tty_struct */ static struct tty_struct *tty_open_by_driver(dev_t device, struct file *filp) { struct tty_struct *tty; struct tty_driver *driver = NULL; int index = -1; int retval; mutex_lock(&tty_mutex); driver = tty_lookup_driver(device, filp, &index); if (IS_ERR(driver)) { mutex_unlock(&tty_mutex); return ERR_CAST(driver); } /* check whether we're reopening an existing tty */ tty = tty_driver_lookup_tty(driver, filp, index); if (IS_ERR(tty)) { mutex_unlock(&tty_mutex); goto out; } if (tty) { if (tty_port_kopened(tty->port)) { tty_kref_put(tty); mutex_unlock(&tty_mutex); tty = ERR_PTR(-EBUSY); goto out; } mutex_unlock(&tty_mutex); retval = tty_lock_interruptible(tty); tty_kref_put(tty); /* drop kref from tty_driver_lookup_tty() */ if (retval) { if (retval == -EINTR) retval = -ERESTARTSYS; tty = ERR_PTR(retval); goto out; } retval = tty_reopen(tty); if (retval < 0) { tty_unlock(tty); tty = ERR_PTR(retval); } } else { /* Returns with the tty_lock held for now */ tty = tty_init_dev(driver, index); mutex_unlock(&tty_mutex); } out: tty_driver_kref_put(driver); return tty; } /** * tty_open - open a tty device * @inode: inode of device file * @filp: file pointer to tty * * tty_open() and tty_release() keep up the tty count that contains the number * of opens done on a tty. We cannot use the inode-count, as different inodes * might point to the same tty. * * Open-counting is needed for pty masters, as well as for keeping track of * serial lines: DTR is dropped when the last close happens. * (This is not done solely through tty->count, now. - Ted 1/27/92) * * The termios state of a pty is reset on the first open so that settings don't * persist across reuse. * * Locking: * * %tty_mutex protects tty, tty_lookup_driver() and tty_init_dev(). * * @tty->count should protect the rest. * * ->siglock protects ->signal/->sighand * * Note: the tty_unlock/lock cases without a ref are only safe due to %tty_mutex */ static int tty_open(struct inode *inode, struct file *filp) { struct tty_struct *tty; int noctty, retval; dev_t device = inode->i_rdev; unsigned saved_flags = filp->f_flags; nonseekable_open(inode, filp); retry_open: retval = tty_alloc_file(filp); if (retval) return -ENOMEM; tty = tty_open_current_tty(device, filp); if (!tty) tty = tty_open_by_driver(device, filp); if (IS_ERR(tty)) { tty_free_file(filp); retval = PTR_ERR(tty); if (retval != -EAGAIN || signal_pending(current)) return retval; schedule(); goto retry_open; } tty_add_file(tty, filp); check_tty_count(tty, __func__); tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); if (tty->ops->open) retval = tty->ops->open(tty, filp); else retval = -ENODEV; filp->f_flags = saved_flags; if (retval) { tty_debug_hangup(tty, "open error %d, releasing\n", retval); tty_unlock(tty); /* need to call tty_release without BTM */ tty_release(inode, filp); if (retval != -ERESTARTSYS) return retval; if (signal_pending(current)) return retval; schedule(); /* * Need to reset f_op in case a hangup happened. */ if (tty_hung_up_p(filp)) filp->f_op = &tty_fops; goto retry_open; } clear_bit(TTY_HUPPED, &tty->flags); noctty = (filp->f_flags & O_NOCTTY) || (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) || device == MKDEV(TTYAUX_MAJOR, 1) || (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER); if (!noctty) tty_open_proc_set_tty(filp, tty); tty_unlock(tty); return 0; } /** * tty_poll - check tty status * @filp: file being polled * @wait: poll wait structures to update * * Call the line discipline polling method to obtain the poll status of the * device. * * Locking: locks called line discipline but ldisc poll method may be * re-entered freely by other callers. */ static __poll_t tty_poll(struct file *filp, poll_table *wait) { struct tty_struct *tty = file_tty(filp); struct tty_ldisc *ld; __poll_t ret = 0; if (tty_paranoia_check(tty, file_inode(filp), "tty_poll")) return 0; ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_poll(filp, wait); if (ld->ops->poll) ret = ld->ops->poll(tty, filp, wait); tty_ldisc_deref(ld); return ret; } static int __tty_fasync(int fd, struct file *filp, int on) { struct tty_struct *tty = file_tty(filp); unsigned long flags; int retval = 0; if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync")) goto out; retval = fasync_helper(fd, filp, on, &tty->fasync); if (retval <= 0) goto out; if (on) { enum pid_type type; struct pid *pid; spin_lock_irqsave(&tty->ctrl.lock, flags); if (tty->ctrl.pgrp) { pid = tty->ctrl.pgrp; type = PIDTYPE_PGID; } else { pid = task_pid(current); type = PIDTYPE_TGID; } get_pid(pid); spin_unlock_irqrestore(&tty->ctrl.lock, flags); __f_setown(filp, pid, type, 0); put_pid(pid); retval = 0; } out: return retval; } static int tty_fasync(int fd, struct file *filp, int on) { struct tty_struct *tty = file_tty(filp); int retval = -ENOTTY; tty_lock(tty); if (!tty_hung_up_p(filp)) retval = __tty_fasync(fd, filp, on); tty_unlock(tty); return retval; } static bool tty_legacy_tiocsti __read_mostly = IS_ENABLED(CONFIG_LEGACY_TIOCSTI); /** * tiocsti - fake input character * @tty: tty to fake input into * @p: pointer to character * * Fake input to a tty device. Does the necessary locking and input management. * * FIXME: does not honour flow control ?? * * Locking: * * Called functions take tty_ldiscs_lock * * current->signal->tty check is safe without locks */ static int tiocsti(struct tty_struct *tty, char __user *p) { char ch, mbz = 0; struct tty_ldisc *ld; if (!tty_legacy_tiocsti && !capable(CAP_SYS_ADMIN)) return -EIO; if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ch, p)) return -EFAULT; tty_audit_tiocsti(tty, ch); ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; tty_buffer_lock_exclusive(tty->port); if (ld->ops->receive_buf) ld->ops->receive_buf(tty, &ch, &mbz, 1); tty_buffer_unlock_exclusive(tty->port); tty_ldisc_deref(ld); return 0; } /** * tiocgwinsz - implement window query ioctl * @tty: tty * @arg: user buffer for result * * Copies the kernel idea of the window size into the user buffer. * * Locking: @tty->winsize_mutex is taken to ensure the winsize data is * consistent. */ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user *arg) { int err; mutex_lock(&tty->winsize_mutex); err = copy_to_user(arg, &tty->winsize, sizeof(*arg)); mutex_unlock(&tty->winsize_mutex); return err ? -EFAULT : 0; } /** * tty_do_resize - resize event * @tty: tty being resized * @ws: new dimensions * * Update the termios variables and send the necessary signals to peform a * terminal resize correctly. */ int tty_do_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp; /* Lock the tty */ mutex_lock(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) goto done; /* Signal the foreground process group */ pgrp = tty_get_pgrp(tty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); put_pid(pgrp); tty->winsize = *ws; done: mutex_unlock(&tty->winsize_mutex); return 0; } EXPORT_SYMBOL(tty_do_resize); /** * tiocswinsz - implement window size set ioctl * @tty: tty side of tty * @arg: user buffer for result * * Copies the user idea of the window size to the kernel. Traditionally this is * just advisory information but for the Linux console it actually has driver * level meaning and triggers a VC resize. * * Locking: * Driver dependent. The default do_resize method takes the tty termios * mutex and ctrl.lock. The console takes its own lock then calls into the * default method. */ static int tiocswinsz(struct tty_struct *tty, struct winsize __user *arg) { struct winsize tmp_ws; if (copy_from_user(&tmp_ws, arg, sizeof(*arg))) return -EFAULT; if (tty->ops->resize) return tty->ops->resize(tty, &tmp_ws); else return tty_do_resize(tty, &tmp_ws); } /** * tioccons - allow admin to move logical console * @file: the file to become console * * Allow the administrator to move the redirected console device. * * Locking: uses redirect_lock to guard the redirect information */ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (file->f_op->write_iter == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); f = redirect; redirect = NULL; spin_unlock(&redirect_lock); if (f) fput(f); return 0; } if (file->f_op->write_iter != tty_write) return -ENOTTY; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; spin_lock(&redirect_lock); if (redirect) { spin_unlock(&redirect_lock); return -EBUSY; } redirect = get_file(file); spin_unlock(&redirect_lock); return 0; } /** * tiocsetd - set line discipline * @tty: tty device * @p: pointer to user data * * Set the line discipline according to user request. * * Locking: see tty_set_ldisc(), this function is just a helper */ static int tiocsetd(struct tty_struct *tty, int __user *p) { int disc; int ret; if (get_user(disc, p)) return -EFAULT; ret = tty_set_ldisc(tty, disc); return ret; } /** * tiocgetd - get line discipline * @tty: tty device * @p: pointer to user data * * Retrieves the line discipline id directly from the ldisc. * * Locking: waits for ldisc reference (in case the line discipline is changing * or the @tty is being hungup) */ static int tiocgetd(struct tty_struct *tty, int __user *p) { struct tty_ldisc *ld; int ret; ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; ret = put_user(ld->ops->num, p); tty_ldisc_deref(ld); return ret; } /** * send_break - performed time break * @tty: device to break on * @duration: timeout in mS * * Perform a timed break on hardware that lacks its own driver level timed * break functionality. * * Locking: * @tty->atomic_write_lock serializes */ static int send_break(struct tty_struct *tty, unsigned int duration) { int retval; if (tty->ops->break_ctl == NULL) return 0; if (tty->driver->flags & TTY_DRIVER_HARDWARE_BREAK) return tty->ops->break_ctl(tty, duration); /* Do the work ourselves */ if (tty_write_lock(tty, false) < 0) return -EINTR; retval = tty->ops->break_ctl(tty, -1); if (!retval) { msleep_interruptible(duration); retval = tty->ops->break_ctl(tty, 0); } tty_write_unlock(tty); if (signal_pending(current)) retval = -EINTR; return retval; } /** * tty_tiocmget - get modem status * @tty: tty device * @p: pointer to result * * Obtain the modem status bits from the tty driver if the feature is * supported. Return -%ENOTTY if it is not available. * * Locking: none (up to the driver) */ static int tty_tiocmget(struct tty_struct *tty, int __user *p) { int retval = -ENOTTY; if (tty->ops->tiocmget) { retval = tty->ops->tiocmget(tty); if (retval >= 0) retval = put_user(retval, p); } return retval; } /** * tty_tiocmset - set modem status * @tty: tty device * @cmd: command - clear bits, set bits or set all * @p: pointer to desired bits * * Set the modem status bits from the tty driver if the feature * is supported. Return -%ENOTTY if it is not available. * * Locking: none (up to the driver) */ static int tty_tiocmset(struct tty_struct *tty, unsigned int cmd, unsigned __user *p) { int retval; unsigned int set, clear, val; if (tty->ops->tiocmset == NULL) return -ENOTTY; retval = get_user(val, p); if (retval) return retval; set = clear = 0; switch (cmd) { case TIOCMBIS: set = val; break; case TIOCMBIC: clear = val; break; case TIOCMSET: set = val; clear = ~val; break; } set &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; clear &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; return tty->ops->tiocmset(tty, set, clear); } /** * tty_get_icount - get tty statistics * @tty: tty device * @icount: output parameter * * Gets a copy of the @tty's icount statistics. * * Locking: none (up to the driver) */ int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { memset(icount, 0, sizeof(*icount)); if (tty->ops->get_icount) return tty->ops->get_icount(tty, icount); else return -ENOTTY; } EXPORT_SYMBOL_GPL(tty_get_icount); static int tty_tiocgicount(struct tty_struct *tty, void __user *arg) { struct serial_icounter_struct icount; int retval; retval = tty_get_icount(tty, &icount); if (retval != 0) return retval; if (copy_to_user(arg, &icount, sizeof(icount))) return -EFAULT; return 0; } static int tty_set_serial(struct tty_struct *tty, struct serial_struct *ss) { char comm[TASK_COMM_LEN]; int flags; flags = ss->flags & ASYNC_DEPRECATED; if (flags) pr_warn_ratelimited("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", __func__, get_task_comm(comm, current), flags); if (!tty->ops->set_serial) return -ENOTTY; return tty->ops->set_serial(tty, ss); } static int tty_tiocsserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; if (copy_from_user(&v, ss, sizeof(*ss))) return -EFAULT; return tty_set_serial(tty, &v); } static int tty_tiocgserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; int err; memset(&v, 0, sizeof(v)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err && copy_to_user(ss, &v, sizeof(v))) err = -EFAULT; return err; } /* * if pty, return the slave side (real_tty) * otherwise, return self */ static struct tty_struct *tty_pair_get_tty(struct tty_struct *tty) { if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) tty = tty->link; return tty; } /* * Split this up, as gcc can choke on it otherwise.. */ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_struct *real_tty; void __user *p = (void __user *)arg; int retval; struct tty_ldisc *ld; if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; real_tty = tty_pair_get_tty(tty); /* * Factor out some common prep work */ switch (cmd) { case TIOCSETD: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: retval = tty_check_change(tty); if (retval) return retval; if (cmd != TIOCCBRK) { tty_wait_until_sent(tty, 0); if (signal_pending(current)) return -EINTR; } break; } /* * Now do the stuff. */ switch (cmd) { case TIOCSTI: return tiocsti(tty, p); case TIOCGWINSZ: return tiocgwinsz(real_tty, p); case TIOCSWINSZ: return tiocswinsz(real_tty, p); case TIOCCONS: return real_tty != tty ? -EINVAL : tioccons(file); case TIOCEXCL: set_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCNXCL: clear_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCGEXCL: { int excl = test_bit(TTY_EXCLUSIVE, &tty->flags); return put_user(excl, (int __user *)p); } case TIOCGETD: return tiocgetd(tty, p); case TIOCSETD: return tiocsetd(tty, p); case TIOCVHANGUP: if (!capable(CAP_SYS_ADMIN)) return -EPERM; tty_vhangup(tty); return 0; case TIOCGDEV: { unsigned int ret = new_encode_dev(tty_devnum(real_tty)); return put_user(ret, (unsigned int __user *)p); } /* * Break handling */ case TIOCSBRK: /* Turn break on, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, -1); return 0; case TIOCCBRK: /* Turn break off, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, 0); return 0; case TCSBRK: /* SVID version: non-zero arg --> no break */ /* non-zero arg means wait for all output data * to be sent (performed above) but don't send break. * This is used by the tcdrain() termios function. */ if (!arg) return send_break(tty, 250); return 0; case TCSBRKP: /* support for POSIX tcsendbreak() */ return send_break(tty, arg ? arg*100 : 250); case TIOCMGET: return tty_tiocmget(tty, p); case TIOCMSET: case TIOCMBIC: case TIOCMBIS: return tty_tiocmset(tty, cmd, p); case TIOCGICOUNT: return tty_tiocgicount(tty, p); case TCFLSH: switch (arg) { case TCIFLUSH: case TCIOFLUSH: /* flush tty buffer and allow ldisc to process ioctl */ tty_buffer_flush(tty, NULL); break; } break; case TIOCSSERIAL: return tty_tiocsserial(tty, p); case TIOCGSERIAL: return tty_tiocgserial(tty, p); case TIOCGPTPEER: /* Special because the struct file is needed */ return ptm_open_peer(file, tty, (int)arg); default: retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } if (tty->ops->ioctl) { retval = tty->ops->ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_ioctl(file, cmd, arg); retval = -EINVAL; if (ld->ops->ioctl) { retval = ld->ops->ioctl(tty, cmd, arg); if (retval == -ENOIOCTLCMD) retval = -ENOTTY; } tty_ldisc_deref(ld); return retval; } #ifdef CONFIG_COMPAT struct serial_struct32 { compat_int_t type; compat_int_t line; compat_uint_t port; compat_int_t irq; compat_int_t flags; compat_int_t xmit_fifo_size; compat_int_t custom_divisor; compat_int_t baud_base; unsigned short close_delay; char io_type; char reserved_char; compat_int_t hub6; unsigned short closing_wait; /* time to wait before closing */ unsigned short closing_wait2; /* no longer used... */ compat_uint_t iomem_base; unsigned short iomem_reg_shift; unsigned int port_high; /* compat_ulong_t iomap_base FIXME */ compat_int_t reserved; }; static int compat_tty_tiocsserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; if (copy_from_user(&v32, ss, sizeof(*ss))) return -EFAULT; memcpy(&v, &v32, offsetof(struct serial_struct32, iomem_base)); v.iomem_base = compat_ptr(v32.iomem_base); v.iomem_reg_shift = v32.iomem_reg_shift; v.port_high = v32.port_high; v.iomap_base = 0; return tty_set_serial(tty, &v); } static int compat_tty_tiocgserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; int err; memset(&v, 0, sizeof(v)); memset(&v32, 0, sizeof(v32)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err) { memcpy(&v32, &v, offsetof(struct serial_struct32, iomem_base)); v32.iomem_base = (unsigned long)v.iomem_base >> 32 ? 0xfffffff : ptr_to_compat(v.iomem_base); v32.iomem_reg_shift = v.iomem_reg_shift; v32.port_high = v.port_high; if (copy_to_user(ss, &v32, sizeof(v32))) err = -EFAULT; } return err; } static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; int retval = -ENOIOCTLCMD; switch (cmd) { case TIOCOUTQ: case TIOCSTI: case TIOCGWINSZ: case TIOCSWINSZ: case TIOCGEXCL: case TIOCGETD: case TIOCSETD: case TIOCGDEV: case TIOCMGET: case TIOCMSET: case TIOCMBIC: case TIOCMBIS: case TIOCGICOUNT: case TIOCGPGRP: case TIOCSPGRP: case TIOCGSID: case TIOCSERGETLSR: case TIOCGRS485: case TIOCSRS485: #ifdef TIOCGETP case TIOCGETP: case TIOCSETP: case TIOCSETN: #endif #ifdef TIOCGETC case TIOCGETC: case TIOCSETC: #endif #ifdef TIOCGLTC case TIOCGLTC: case TIOCSLTC: #endif case TCSETSF: case TCSETSW: case TCSETS: case TCGETS: #ifdef TCGETS2 case TCGETS2: case TCSETSF2: case TCSETSW2: case TCSETS2: #endif case TCGETA: case TCSETAF: case TCSETAW: case TCSETA: case TIOCGLCKTRMIOS: case TIOCSLCKTRMIOS: #ifdef TCGETX case TCGETX: case TCSETX: case TCSETXW: case TCSETXF: #endif case TIOCGSOFTCAR: case TIOCSSOFTCAR: case PPPIOCGCHAN: case PPPIOCGUNIT: return tty_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); case TIOCCONS: case TIOCEXCL: case TIOCNXCL: case TIOCVHANGUP: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: case TCFLSH: case TIOCGPTPEER: case TIOCNOTTY: case TIOCSCTTY: case TCXONC: case TIOCMIWAIT: case TIOCSERCONFIG: return tty_ioctl(file, cmd, arg); } if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; switch (cmd) { case TIOCSSERIAL: return compat_tty_tiocsserial(tty, compat_ptr(arg)); case TIOCGSERIAL: return compat_tty_tiocgserial(tty, compat_ptr(arg)); } if (tty->ops->compat_ioctl) { retval = tty->ops->compat_ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_compat_ioctl(file, cmd, arg); if (ld->ops->compat_ioctl) retval = ld->ops->compat_ioctl(tty, cmd, arg); if (retval == -ENOIOCTLCMD && ld->ops->ioctl) retval = ld->ops->ioctl(tty, (unsigned long)compat_ptr(cmd), arg); tty_ldisc_deref(ld); return retval; } #endif static int this_tty(const void *t, struct file *file, unsigned fd) { if (likely(file->f_op->read_iter != tty_read)) return 0; return file_tty(file) != t ? 0 : fd + 1; } /* * This implements the "Secure Attention Key" --- the idea is to * prevent trojan horses by killing all processes associated with this * tty when the user hits the "Secure Attention Key". Required for * super-paranoid applications --- see the Orange Book for more details. * * This code could be nicer; ideally it should send a HUP, wait a few * seconds, then send a INT, and then a KILL signal. But you then * have to coordinate with the init process, since all processes associated * with the current tty must be dead before the new getty is allowed * to spawn. * * Now, if it would be correct ;-/ The current code has a nasty hole - * it doesn't catch files in flight. We may send the descriptor to ourselves * via AF_UNIX socket, close it and later fetch from socket. FIXME. * * Nasty bug: do_SAK is being called in interrupt context. This can * deadlock. We punt it up to process context. AKPM - 16Mar2001 */ void __do_SAK(struct tty_struct *tty) { struct task_struct *g, *p; struct pid *session; int i; unsigned long flags; spin_lock_irqsave(&tty->ctrl.lock, flags); session = get_pid(tty->ctrl.session); spin_unlock_irqrestore(&tty->ctrl.lock, flags); tty_ldisc_flush(tty); tty_driver_flush_buffer(tty); read_lock(&tasklist_lock); /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { tty_notice(tty, "SAK: killed process %d (%s): by session\n", task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } while_each_pid_task(session, PIDTYPE_SID, p); /* Now kill any processes that happen to have the tty open */ for_each_process_thread(g, p) { if (p->signal->tty == tty) { tty_notice(tty, "SAK: killed process %d (%s): by controlling tty\n", task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); continue; } task_lock(p); i = iterate_fd(p->files, 0, this_tty, tty); if (i != 0) { tty_notice(tty, "SAK: killed process %d (%s): by fd#%d\n", task_pid_nr(p), p->comm, i - 1); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } task_unlock(p); } read_unlock(&tasklist_lock); put_pid(session); } static void do_SAK_work(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, SAK_work); __do_SAK(tty); } /* * The tq handling here is a little racy - tty->SAK_work may already be queued. * Fortunately we don't need to worry, because if ->SAK_work is already queued, * the values which we write to it will be identical to the values which it * already has. --akpm */ void do_SAK(struct tty_struct *tty) { if (!tty) return; schedule_work(&tty->SAK_work); } EXPORT_SYMBOL(do_SAK); /* Must put_device() after it's unused! */ static struct device *tty_get_device(struct tty_struct *tty) { dev_t devt = tty_devnum(tty); return class_find_device_by_devt(&tty_class, devt); } /** * alloc_tty_struct - allocate a new tty * @driver: driver which will handle the returned tty * @idx: minor of the tty * * This subroutine allocates and initializes a tty structure. * * Locking: none - @tty in question is not exposed at this point */ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) { struct tty_struct *tty; tty = kzalloc(sizeof(*tty), GFP_KERNEL_ACCOUNT); if (!tty) return NULL; kref_init(&tty->kref); if (tty_ldisc_init(tty)) { kfree(tty); return NULL; } tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; mutex_init(&tty->legacy_mutex); mutex_init(&tty->throttle_mutex); init_rwsem(&tty->termios_rwsem); mutex_init(&tty->winsize_mutex); init_ldsem(&tty->ldisc_sem); init_waitqueue_head(&tty->write_wait); init_waitqueue_head(&tty->read_wait); INIT_WORK(&tty->hangup_work, do_tty_hangup); mutex_init(&tty->atomic_write_lock); spin_lock_init(&tty->ctrl.lock); spin_lock_init(&tty->flow.lock); spin_lock_init(&tty->files_lock); INIT_LIST_HEAD(&tty->tty_files); INIT_WORK(&tty->SAK_work, do_SAK_work); tty->driver = driver; tty->ops = driver->ops; tty->index = idx; tty_line_name(driver, idx, tty->name); tty->dev = tty_get_device(tty); return tty; } /** * tty_put_char - write one character to a tty * @tty: tty * @ch: character to write * * Write one byte to the @tty using the provided @tty->ops->put_char() method * if present. * * Note: the specific put_char operation in the driver layer may go * away soon. Don't call it directly, use this method * * Return: the number of characters successfully output. */ int tty_put_char(struct tty_struct *tty, unsigned char ch) { if (tty->ops->put_char) return tty->ops->put_char(tty, ch); return tty->ops->write(tty, &ch, 1); } EXPORT_SYMBOL_GPL(tty_put_char); static int tty_cdev_add(struct tty_driver *driver, dev_t dev, unsigned int index, unsigned int count) { int err; /* init here, since reused cdevs cause crashes */ driver->cdevs[index] = cdev_alloc(); if (!driver->cdevs[index]) return -ENOMEM; driver->cdevs[index]->ops = &tty_fops; driver->cdevs[index]->owner = driver->owner; err = cdev_add(driver->cdevs[index], dev, count); if (err) kobject_put(&driver->cdevs[index]->kobj); return err; } /** * tty_register_device - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to NULL safely. * * This call is required to be made to register an individual tty device * if the tty driver's flags have the %TTY_DRIVER_DYNAMIC_DEV bit set. If * that bit is not set, this function should not be called by a tty * driver. * * Locking: ?? * * Return: A pointer to the struct device for this tty device (or * ERR_PTR(-EFOO) on error). */ struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *device) { return tty_register_device_attr(driver, index, device, NULL, NULL); } EXPORT_SYMBOL(tty_register_device); static void tty_device_create_release(struct device *dev) { dev_dbg(dev, "releasing...\n"); kfree(dev); } /** * tty_register_device_attr - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to %NULL safely. * @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * This call is required to be made to register an individual tty device if the * tty driver's flags have the %TTY_DRIVER_DYNAMIC_DEV bit set. If that bit is * not set, this function should not be called by a tty driver. * * Locking: ?? * * Return: A pointer to the struct device for this tty device (or * ERR_PTR(-EFOO) on error). */ struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { char name[64]; dev_t devt = MKDEV(driver->major, driver->minor_start) + index; struct ktermios *tp; struct device *dev; int retval; if (index >= driver->num) { pr_err("%s: Attempt to register invalid tty line number (%d)\n", driver->name, index); return ERR_PTR(-EINVAL); } if (driver->type == TTY_DRIVER_TYPE_PTY) pty_line_name(driver, index, name); else tty_line_name(driver, index, name); dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); dev->devt = devt; dev->class = &tty_class; dev->parent = device; dev->release = tty_device_create_release; dev_set_name(dev, "%s", name); dev->groups = attr_grp; dev_set_drvdata(dev, drvdata); dev_set_uevent_suppress(dev, 1); retval = device_register(dev); if (retval) goto err_put; if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { /* * Free any saved termios data so that the termios state is * reset when reusing a minor number. */ tp = driver->termios[index]; if (tp) { driver->termios[index] = NULL; kfree(tp); } retval = tty_cdev_add(driver, devt, index, 1); if (retval) goto err_del; } dev_set_uevent_suppress(dev, 0); kobject_uevent(&dev->kobj, KOBJ_ADD); return dev; err_del: device_del(dev); err_put: put_device(dev); return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(tty_register_device_attr); /** * tty_unregister_device - unregister a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * * If a tty device is registered with a call to tty_register_device() then * this function must be called when the tty device is gone. * * Locking: ?? */ void tty_unregister_device(struct tty_driver *driver, unsigned index) { device_destroy(&tty_class, MKDEV(driver->major, driver->minor_start) + index); if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { cdev_del(driver->cdevs[index]); driver->cdevs[index] = NULL; } } EXPORT_SYMBOL(tty_unregister_device); /** * __tty_alloc_driver - allocate tty driver * @lines: count of lines this driver can handle at most * @owner: module which is responsible for this driver * @flags: some of %TTY_DRIVER_ flags, will be set in driver->flags * * This should not be called directly, some of the provided macros should be * used instead. Use IS_ERR() and friends on @retval. */ struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, unsigned long flags) { struct tty_driver *driver; unsigned int cdevs = 1; int err; if (!lines || (flags & TTY_DRIVER_UNNUMBERED_NODE && lines > 1)) return ERR_PTR(-EINVAL); driver = kzalloc(sizeof(*driver), GFP_KERNEL); if (!driver) return ERR_PTR(-ENOMEM); kref_init(&driver->kref); driver->num = lines; driver->owner = owner; driver->flags = flags; if (!(flags & TTY_DRIVER_DEVPTS_MEM)) { driver->ttys = kcalloc(lines, sizeof(*driver->ttys), GFP_KERNEL); driver->termios = kcalloc(lines, sizeof(*driver->termios), GFP_KERNEL); if (!driver->ttys || !driver->termios) { err = -ENOMEM; goto err_free_all; } } if (!(flags & TTY_DRIVER_DYNAMIC_ALLOC)) { driver->ports = kcalloc(lines, sizeof(*driver->ports), GFP_KERNEL); if (!driver->ports) { err = -ENOMEM; goto err_free_all; } cdevs = lines; } driver->cdevs = kcalloc(cdevs, sizeof(*driver->cdevs), GFP_KERNEL); if (!driver->cdevs) { err = -ENOMEM; goto err_free_all; } return driver; err_free_all: kfree(driver->ports); kfree(driver->ttys); kfree(driver->termios); kfree(driver->cdevs); kfree(driver); return ERR_PTR(err); } EXPORT_SYMBOL(__tty_alloc_driver); static void destruct_tty_driver(struct kref *kref) { struct tty_driver *driver = container_of(kref, struct tty_driver, kref); int i; struct ktermios *tp; if (driver->flags & TTY_DRIVER_INSTALLED) { for (i = 0; i < driver->num; i++) { tp = driver->termios[i]; if (tp) { driver->termios[i] = NULL; kfree(tp); } if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) tty_unregister_device(driver, i); } proc_tty_unregister_driver(driver); if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) cdev_del(driver->cdevs[0]); } kfree(driver->cdevs); kfree(driver->ports); kfree(driver->termios); kfree(driver->ttys); kfree(driver); } /** * tty_driver_kref_put - drop a reference to a tty driver * @driver: driver of which to drop the reference * * The final put will destroy and free up the driver. */ void tty_driver_kref_put(struct tty_driver *driver) { kref_put(&driver->kref, destruct_tty_driver); } EXPORT_SYMBOL(tty_driver_kref_put); /** * tty_register_driver - register a tty driver * @driver: driver to register * * Called by a tty driver to register itself. */ int tty_register_driver(struct tty_driver *driver) { int error; int i; dev_t dev; struct device *d; if (!driver->major) { error = alloc_chrdev_region(&dev, driver->minor_start, driver->num, driver->name); if (!error) { driver->major = MAJOR(dev); driver->minor_start = MINOR(dev); } } else { dev = MKDEV(driver->major, driver->minor_start); error = register_chrdev_region(dev, driver->num, driver->name); } if (error < 0) goto err; if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) { error = tty_cdev_add(driver, dev, 0, driver->num); if (error) goto err_unreg_char; } mutex_lock(&tty_mutex); list_add(&driver->tty_drivers, &tty_drivers); mutex_unlock(&tty_mutex); if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) { for (i = 0; i < driver->num; i++) { d = tty_register_device(driver, i, NULL); if (IS_ERR(d)) { error = PTR_ERR(d); goto err_unreg_devs; } } } proc_tty_register_driver(driver); driver->flags |= TTY_DRIVER_INSTALLED; return 0; err_unreg_devs: for (i--; i >= 0; i--) tty_unregister_device(driver, i); mutex_lock(&tty_mutex); list_del(&driver->tty_drivers); mutex_unlock(&tty_mutex); err_unreg_char: unregister_chrdev_region(dev, driver->num); err: return error; } EXPORT_SYMBOL(tty_register_driver); /** * tty_unregister_driver - unregister a tty driver * @driver: driver to unregister * * Called by a tty driver to unregister itself. */ void tty_unregister_driver(struct tty_driver *driver) { unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), driver->num); mutex_lock(&tty_mutex); list_del(&driver->tty_drivers); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL(tty_unregister_driver); dev_t tty_devnum(struct tty_struct *tty) { return MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; } EXPORT_SYMBOL(tty_devnum); void tty_default_fops(struct file_operations *fops) { *fops = tty_fops; } static char *tty_devnode(const struct device *dev, umode_t *mode) { if (!mode) return NULL; if (dev->devt == MKDEV(TTYAUX_MAJOR, 0) || dev->devt == MKDEV(TTYAUX_MAJOR, 2)) *mode = 0666; return NULL; } const struct class tty_class = { .name = "tty", .devnode = tty_devnode, }; static int __init tty_class_init(void) { return class_register(&tty_class); } postcore_initcall(tty_class_init); /* 3/2004 jmc: why do these devices exist? */ static struct cdev tty_cdev, console_cdev; static ssize_t show_cons_active(struct device *dev, struct device_attribute *attr, char *buf) { struct console *cs[16]; int i = 0; struct console *c; ssize_t count = 0; /* * Hold the console_list_lock to guarantee that no consoles are * unregistered until all console processing is complete. * This also allows safe traversal of the console list and * race-free reading of @flags. */ console_list_lock(); for_each_console(c) { if (!c->device) continue; if (!c->write) continue; if ((c->flags & CON_ENABLED) == 0) continue; cs[i++] = c; if (i >= ARRAY_SIZE(cs)) break; } /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); while (i--) { int index = cs[i]->index; struct tty_driver *drv = cs[i]->device(cs[i], &index); /* don't resolve tty0 as some programs depend on it */ if (drv && (cs[i]->index > 0 || drv->major != TTY_MAJOR)) count += tty_line_name(drv, index, buf + count); else count += sprintf(buf + count, "%s%d", cs[i]->name, cs[i]->index); count += sprintf(buf + count, "%c", i ? ' ':'\n'); } console_unlock(); console_list_unlock(); return count; } static DEVICE_ATTR(active, S_IRUGO, show_cons_active, NULL); static struct attribute *cons_dev_attrs[] = { &dev_attr_active.attr, NULL }; ATTRIBUTE_GROUPS(cons_dev); static struct device *consdev; void console_sysfs_notify(void) { if (consdev) sysfs_notify(&consdev->kobj, NULL, "active"); } static struct ctl_table tty_table[] = { { .procname = "legacy_tiocsti", .data = &tty_legacy_tiocsti, .maxlen = sizeof(tty_legacy_tiocsti), .mode = 0644, .proc_handler = proc_dobool, }, { .procname = "ldisc_autoload", .data = &tty_ldisc_autoload, .maxlen = sizeof(tty_ldisc_autoload), .mode = 0644, .proc_handler = proc_dointvec, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; /* * Ok, now we can initialize the rest of the tty devices and can count * on memory allocations, interrupts etc.. */ int __init tty_init(void) { register_sysctl_init("dev/tty", tty_table); cdev_init(&tty_cdev, &tty_fops); if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 0), 1, "/dev/tty") < 0) panic("Couldn't register /dev/tty driver\n"); device_create(&tty_class, NULL, MKDEV(TTYAUX_MAJOR, 0), NULL, "tty"); cdev_init(&console_cdev, &console_fops); if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0) panic("Couldn't register /dev/console driver\n"); consdev = device_create_with_groups(&tty_class, NULL, MKDEV(TTYAUX_MAJOR, 1), NULL, cons_dev_groups, "console"); if (IS_ERR(consdev)) consdev = NULL; #ifdef CONFIG_VT vty_init(&console_fops); #endif return 0; }
213 3046 1 1984 1986 2051 2 3 1 3015 3078 3076 3038 3335 19 3336 3039 185 3021 3285 1920 3285 3285 3019 3016 3016 3337 3337 3285 3284 3337 3337 3337 3337 3016 3019 3016 3018 19 3337 3074 239 3018 3077 3335 534 534 534 247 247 247 218 25 367 367 27 5 335 377 377 377 534 43 43 1914 1914 1914 213 213 3 1982 1982 1982 1982 21 20 19 8 1982 7 7 7 7 7 1905 185 1905 1905 185 185 43 3020 43 3021 3019 5 3021 3020 344 3019 3019 3019 3018 3018 3020 3019 3019 176 170 176 3321 3320 3321 3312 3312 1896 3321 3044 3016 24 24 24 19 19 72 3019 3044 3046 24 24 3046 3016 3019 3015 3017 2867 41 3044 3016 49 49 36 19 19 3018 3018 3019 3017 46 3047 3019 15 54 54 54 54 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 // SPDX-License-Identifier: GPL-2.0+ /* * dummy_hcd.c -- Dummy/Loopback USB host and device emulator driver. * * Maintainer: Alan Stern <stern@rowland.harvard.edu> * * Copyright (C) 2003 David Brownell * Copyright (C) 2003-2005 Alan Stern */ /* * This exposes a device side "USB gadget" API, driven by requests to a * Linux-USB host controller driver. USB traffic is simulated; there's * no need for USB hardware. Use this with two other drivers: * * - Gadget driver, responding to requests (device); * - Host-side device driver, as already familiar in Linux. * * Having this all in one kernel can help some stages of development, * bypassing some hardware (and driver) issues. UML could help too. * * Note: The emulation does not include isochronous transfers! */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/ioport.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/timer.h> #include <linux/list.h> #include <linux/interrupt.h> #include <linux/platform_device.h> #include <linux/usb.h> #include <linux/usb/gadget.h> #include <linux/usb/hcd.h> #include <linux/scatterlist.h> #include <asm/byteorder.h> #include <linux/io.h> #include <asm/irq.h> #include <asm/unaligned.h> #define DRIVER_DESC "USB Host+Gadget Emulator" #define DRIVER_VERSION "02 May 2005" #define POWER_BUDGET 500 /* in mA; use 8 for low-power port testing */ #define POWER_BUDGET_3 900 /* in mA */ static const char driver_name[] = "dummy_hcd"; static const char driver_desc[] = "USB Host+Gadget Emulator"; static const char gadget_name[] = "dummy_udc"; MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR("David Brownell"); MODULE_LICENSE("GPL"); struct dummy_hcd_module_parameters { bool is_super_speed; bool is_high_speed; unsigned int num; }; static struct dummy_hcd_module_parameters mod_data = { .is_super_speed = false, .is_high_speed = true, .num = 1, }; module_param_named(is_super_speed, mod_data.is_super_speed, bool, S_IRUGO); MODULE_PARM_DESC(is_super_speed, "true to simulate SuperSpeed connection"); module_param_named(is_high_speed, mod_data.is_high_speed, bool, S_IRUGO); MODULE_PARM_DESC(is_high_speed, "true to simulate HighSpeed connection"); module_param_named(num, mod_data.num, uint, S_IRUGO); MODULE_PARM_DESC(num, "number of emulated controllers"); /*-------------------------------------------------------------------------*/ /* gadget side driver data structres */ struct dummy_ep { struct list_head queue; unsigned long last_io; /* jiffies timestamp */ struct usb_gadget *gadget; const struct usb_endpoint_descriptor *desc; struct usb_ep ep; unsigned halted:1; unsigned wedged:1; unsigned already_seen:1; unsigned setup_stage:1; unsigned stream_en:1; }; struct dummy_request { struct list_head queue; /* ep's requests */ struct usb_request req; }; static inline struct dummy_ep *usb_ep_to_dummy_ep(struct usb_ep *_ep) { return container_of(_ep, struct dummy_ep, ep); } static inline struct dummy_request *usb_request_to_dummy_request (struct usb_request *_req) { return container_of(_req, struct dummy_request, req); } /*-------------------------------------------------------------------------*/ /* * Every device has ep0 for control requests, plus up to 30 more endpoints, * in one of two types: * * - Configurable: direction (in/out), type (bulk, iso, etc), and endpoint * number can be changed. Names like "ep-a" are used for this type. * * - Fixed Function: in other cases. some characteristics may be mutable; * that'd be hardware-specific. Names like "ep12out-bulk" are used. * * Gadget drivers are responsible for not setting up conflicting endpoint * configurations, illegal or unsupported packet lengths, and so on. */ static const char ep0name[] = "ep0"; static const struct { const char *name; const struct usb_ep_caps caps; } ep_info[] = { #define EP_INFO(_name, _caps) \ { \ .name = _name, \ .caps = _caps, \ } /* we don't provide isochronous endpoints since we don't support them */ #define TYPE_BULK_OR_INT (USB_EP_CAPS_TYPE_BULK | USB_EP_CAPS_TYPE_INT) /* everyone has ep0 */ EP_INFO(ep0name, USB_EP_CAPS(USB_EP_CAPS_TYPE_CONTROL, USB_EP_CAPS_DIR_ALL)), /* act like a pxa250: fifteen fixed function endpoints */ EP_INFO("ep1in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep2out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep3in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep4out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep5in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep6in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep7out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep8in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep9out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep10in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep11in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep12out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep13in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep14out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep15in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), /* or like sa1100: two fixed function endpoints */ EP_INFO("ep1out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep2in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), /* and now some generic EPs so we have enough in multi config */ EP_INFO("ep-aout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-bin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-cout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-dout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-ein", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-fout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-gin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-hout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-iout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-jin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-kout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-lin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-mout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), #undef EP_INFO }; #define DUMMY_ENDPOINTS ARRAY_SIZE(ep_info) /*-------------------------------------------------------------------------*/ #define FIFO_SIZE 64 struct urbp { struct urb *urb; struct list_head urbp_list; struct sg_mapping_iter miter; u32 miter_started; }; enum dummy_rh_state { DUMMY_RH_RESET, DUMMY_RH_SUSPENDED, DUMMY_RH_RUNNING }; struct dummy_hcd { struct dummy *dum; enum dummy_rh_state rh_state; struct timer_list timer; u32 port_status; u32 old_status; unsigned long re_timeout; struct usb_device *udev; struct list_head urbp_list; struct urbp *next_frame_urbp; u32 stream_en_ep; u8 num_stream[30 / 2]; unsigned active:1; unsigned old_active:1; unsigned resuming:1; }; struct dummy { spinlock_t lock; /* * DEVICE/GADGET side support */ struct dummy_ep ep[DUMMY_ENDPOINTS]; int address; int callback_usage; struct usb_gadget gadget; struct usb_gadget_driver *driver; struct dummy_request fifo_req; u8 fifo_buf[FIFO_SIZE]; u16 devstatus; unsigned ints_enabled:1; unsigned udc_suspended:1; unsigned pullup:1; /* * HOST side support */ struct dummy_hcd *hs_hcd; struct dummy_hcd *ss_hcd; }; static inline struct dummy_hcd *hcd_to_dummy_hcd(struct usb_hcd *hcd) { return (struct dummy_hcd *) (hcd->hcd_priv); } static inline struct usb_hcd *dummy_hcd_to_hcd(struct dummy_hcd *dum) { return container_of((void *) dum, struct usb_hcd, hcd_priv); } static inline struct device *dummy_dev(struct dummy_hcd *dum) { return dummy_hcd_to_hcd(dum)->self.controller; } static inline struct device *udc_dev(struct dummy *dum) { return dum->gadget.dev.parent; } static inline struct dummy *ep_to_dummy(struct dummy_ep *ep) { return container_of(ep->gadget, struct dummy, gadget); } static inline struct dummy_hcd *gadget_to_dummy_hcd(struct usb_gadget *gadget) { struct dummy *dum = container_of(gadget, struct dummy, gadget); if (dum->gadget.speed == USB_SPEED_SUPER) return dum->ss_hcd; else return dum->hs_hcd; } static inline struct dummy *gadget_dev_to_dummy(struct device *dev) { return container_of(dev, struct dummy, gadget.dev); } /*-------------------------------------------------------------------------*/ /* DEVICE/GADGET SIDE UTILITY ROUTINES */ /* called with spinlock held */ static void nuke(struct dummy *dum, struct dummy_ep *ep) { while (!list_empty(&ep->queue)) { struct dummy_request *req; req = list_entry(ep->queue.next, struct dummy_request, queue); list_del_init(&req->queue); req->req.status = -ESHUTDOWN; spin_unlock(&dum->lock); usb_gadget_giveback_request(&ep->ep, &req->req); spin_lock(&dum->lock); } } /* caller must hold lock */ static void stop_activity(struct dummy *dum) { int i; /* prevent any more requests */ dum->address = 0; /* The timer is left running so that outstanding URBs can fail */ /* nuke any pending requests first, so driver i/o is quiesced */ for (i = 0; i < DUMMY_ENDPOINTS; ++i) nuke(dum, &dum->ep[i]); /* driver now does any non-usb quiescing necessary */ } /** * set_link_state_by_speed() - Sets the current state of the link according to * the hcd speed * @dum_hcd: pointer to the dummy_hcd structure to update the link state for * * This function updates the port_status according to the link state and the * speed of the hcd. */ static void set_link_state_by_speed(struct dummy_hcd *dum_hcd) { struct dummy *dum = dum_hcd->dum; if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) { if ((dum_hcd->port_status & USB_SS_PORT_STAT_POWER) == 0) { dum_hcd->port_status = 0; } else if (!dum->pullup || dum->udc_suspended) { /* UDC suspend must cause a disconnect */ dum_hcd->port_status &= ~(USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE); if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) != 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); } else { /* device is connected and not suspended */ dum_hcd->port_status |= (USB_PORT_STAT_CONNECTION | USB_PORT_STAT_SPEED_5GBPS) ; if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) == 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) && (dum_hcd->port_status & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_U0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) dum_hcd->active = 1; } } else { if ((dum_hcd->port_status & USB_PORT_STAT_POWER) == 0) { dum_hcd->port_status = 0; } else if (!dum->pullup || dum->udc_suspended) { /* UDC suspend must cause a disconnect */ dum_hcd->port_status &= ~(USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE | USB_PORT_STAT_LOW_SPEED | USB_PORT_STAT_HIGH_SPEED | USB_PORT_STAT_SUSPEND); if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) != 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); } else { dum_hcd->port_status |= USB_PORT_STAT_CONNECTION; if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) == 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) == 0) dum_hcd->port_status &= ~USB_PORT_STAT_SUSPEND; else if ((dum_hcd->port_status & USB_PORT_STAT_SUSPEND) == 0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) dum_hcd->active = 1; } } } /* caller must hold lock */ static void set_link_state(struct dummy_hcd *dum_hcd) __must_hold(&dum->lock) { struct dummy *dum = dum_hcd->dum; unsigned int power_bit; dum_hcd->active = 0; if (dum->pullup) if ((dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3 && dum->gadget.speed != USB_SPEED_SUPER) || (dummy_hcd_to_hcd(dum_hcd)->speed != HCD_USB3 && dum->gadget.speed == USB_SPEED_SUPER)) return; set_link_state_by_speed(dum_hcd); power_bit = (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3 ? USB_SS_PORT_STAT_POWER : USB_PORT_STAT_POWER); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) == 0 || dum_hcd->active) dum_hcd->resuming = 0; /* Currently !connected or in reset */ if ((dum_hcd->port_status & power_bit) == 0 || (dum_hcd->port_status & USB_PORT_STAT_RESET) != 0) { unsigned int disconnect = power_bit & dum_hcd->old_status & (~dum_hcd->port_status); unsigned int reset = USB_PORT_STAT_RESET & (~dum_hcd->old_status) & dum_hcd->port_status; /* Report reset and disconnect events to the driver */ if (dum->ints_enabled && (disconnect || reset)) { stop_activity(dum); ++dum->callback_usage; spin_unlock(&dum->lock); if (reset) usb_gadget_udc_reset(&dum->gadget, dum->driver); else dum->driver->disconnect(&dum->gadget); spin_lock(&dum->lock); --dum->callback_usage; } } else if (dum_hcd->active != dum_hcd->old_active && dum->ints_enabled) { ++dum->callback_usage; spin_unlock(&dum->lock); if (dum_hcd->old_active && dum->driver->suspend) dum->driver->suspend(&dum->gadget); else if (!dum_hcd->old_active && dum->driver->resume) dum->driver->resume(&dum->gadget); spin_lock(&dum->lock); --dum->callback_usage; } dum_hcd->old_status = dum_hcd->port_status; dum_hcd->old_active = dum_hcd->active; } /*-------------------------------------------------------------------------*/ /* DEVICE/GADGET SIDE DRIVER * * This only tracks gadget state. All the work is done when the host * side tries some (emulated) i/o operation. Real device controller * drivers would do real i/o using dma, fifos, irqs, timers, etc. */ #define is_enabled(dum) \ (dum->port_status & USB_PORT_STAT_ENABLE) static int dummy_enable(struct usb_ep *_ep, const struct usb_endpoint_descriptor *desc) { struct dummy *dum; struct dummy_hcd *dum_hcd; struct dummy_ep *ep; unsigned max; int retval; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || !desc || ep->desc || _ep->name == ep0name || desc->bDescriptorType != USB_DT_ENDPOINT) return -EINVAL; dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; dum_hcd = gadget_to_dummy_hcd(&dum->gadget); if (!is_enabled(dum_hcd)) return -ESHUTDOWN; /* * For HS/FS devices only bits 0..10 of the wMaxPacketSize represent the * maximum packet size. * For SS devices the wMaxPacketSize is limited by 1024. */ max = usb_endpoint_maxp(desc); /* drivers must not request bad settings, since lower levels * (hardware or its drivers) may not check. some endpoints * can't do iso, many have maxpacket limitations, etc. * * since this "hardware" driver is here to help debugging, we * have some extra sanity checks. (there could be more though, * especially for "ep9out" style fixed function ones.) */ retval = -EINVAL; switch (usb_endpoint_type(desc)) { case USB_ENDPOINT_XFER_BULK: if (strstr(ep->ep.name, "-iso") || strstr(ep->ep.name, "-int")) { goto done; } switch (dum->gadget.speed) { case USB_SPEED_SUPER: if (max == 1024) break; goto done; case USB_SPEED_HIGH: if (max == 512) break; goto done; case USB_SPEED_FULL: if (max == 8 || max == 16 || max == 32 || max == 64) /* we'll fake any legal size */ break; /* save a return statement */ fallthrough; default: goto done; } break; case USB_ENDPOINT_XFER_INT: if (strstr(ep->ep.name, "-iso")) /* bulk is ok */ goto done; /* real hardware might not handle all packet sizes */ switch (dum->gadget.speed) { case USB_SPEED_SUPER: case USB_SPEED_HIGH: if (max <= 1024) break; /* save a return statement */ fallthrough; case USB_SPEED_FULL: if (max <= 64) break; /* save a return statement */ fallthrough; default: if (max <= 8) break; goto done; } break; case USB_ENDPOINT_XFER_ISOC: if (strstr(ep->ep.name, "-bulk") || strstr(ep->ep.name, "-int")) goto done; /* real hardware might not handle all packet sizes */ switch (dum->gadget.speed) { case USB_SPEED_SUPER: case USB_SPEED_HIGH: if (max <= 1024) break; /* save a return statement */ fallthrough; case USB_SPEED_FULL: if (max <= 1023) break; /* save a return statement */ fallthrough; default: goto done; } break; default: /* few chips support control except on ep0 */ goto done; } _ep->maxpacket = max; if (usb_ss_max_streams(_ep->comp_desc)) { if (!usb_endpoint_xfer_bulk(desc)) { dev_err(udc_dev(dum), "Can't enable stream support on " "non-bulk ep %s\n", _ep->name); return -EINVAL; } ep->stream_en = 1; } ep->desc = desc; dev_dbg(udc_dev(dum), "enabled %s (ep%d%s-%s) maxpacket %d stream %s\n", _ep->name, desc->bEndpointAddress & 0x0f, (desc->bEndpointAddress & USB_DIR_IN) ? "in" : "out", usb_ep_type_string(usb_endpoint_type(desc)), max, ep->stream_en ? "enabled" : "disabled"); /* at this point real hardware should be NAKing transfers * to that endpoint, until a buffer is queued to it. */ ep->halted = ep->wedged = 0; retval = 0; done: return retval; } static int dummy_disable(struct usb_ep *_ep) { struct dummy_ep *ep; struct dummy *dum; unsigned long flags; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || !ep->desc || _ep->name == ep0name) return -EINVAL; dum = ep_to_dummy(ep); spin_lock_irqsave(&dum->lock, flags); ep->desc = NULL; ep->stream_en = 0; nuke(dum, ep); spin_unlock_irqrestore(&dum->lock, flags); dev_dbg(udc_dev(dum), "disabled %s\n", _ep->name); return 0; } static struct usb_request *dummy_alloc_request(struct usb_ep *_ep, gfp_t mem_flags) { struct dummy_request *req; if (!_ep) return NULL; req = kzalloc(sizeof(*req), mem_flags); if (!req) return NULL; INIT_LIST_HEAD(&req->queue); return &req->req; } static void dummy_free_request(struct usb_ep *_ep, struct usb_request *_req) { struct dummy_request *req; if (!_ep || !_req) { WARN_ON(1); return; } req = usb_request_to_dummy_request(_req); WARN_ON(!list_empty(&req->queue)); kfree(req); } static void fifo_complete(struct usb_ep *ep, struct usb_request *req) { } static int dummy_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t mem_flags) { struct dummy_ep *ep; struct dummy_request *req; struct dummy *dum; struct dummy_hcd *dum_hcd; unsigned long flags; req = usb_request_to_dummy_request(_req); if (!_req || !list_empty(&req->queue) || !_req->complete) return -EINVAL; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || (!ep->desc && _ep->name != ep0name)) return -EINVAL; dum = ep_to_dummy(ep); dum_hcd = gadget_to_dummy_hcd(&dum->gadget); if (!dum->driver || !is_enabled(dum_hcd)) return -ESHUTDOWN; #if 0 dev_dbg(udc_dev(dum), "ep %p queue req %p to %s, len %d buf %p\n", ep, _req, _ep->name, _req->length, _req->buf); #endif _req->status = -EINPROGRESS; _req->actual = 0; spin_lock_irqsave(&dum->lock, flags); /* implement an emulated single-request FIFO */ if (ep->desc && (ep->desc->bEndpointAddress & USB_DIR_IN) && list_empty(&dum->fifo_req.queue) && list_empty(&ep->queue) && _req->length <= FIFO_SIZE) { req = &dum->fifo_req; req->req = *_req; req->req.buf = dum->fifo_buf; memcpy(dum->fifo_buf, _req->buf, _req->length); req->req.context = dum; req->req.complete = fifo_complete; list_add_tail(&req->queue, &ep->queue); spin_unlock(&dum->lock); _req->actual = _req->length; _req->status = 0; usb_gadget_giveback_request(_ep, _req); spin_lock(&dum->lock); } else list_add_tail(&req->queue, &ep->queue); spin_unlock_irqrestore(&dum->lock, flags); /* real hardware would likely enable transfers here, in case * it'd been left NAKing. */ return 0; } static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req) { struct dummy_ep *ep; struct dummy *dum; int retval = -EINVAL; unsigned long flags; struct dummy_request *req = NULL, *iter; if (!_ep || !_req) return retval; ep = usb_ep_to_dummy_ep(_ep); dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; local_irq_save(flags); spin_lock(&dum->lock); list_for_each_entry(iter, &ep->queue, queue) { if (&iter->req != _req) continue; list_del_init(&iter->queue); _req->status = -ECONNRESET; req = iter; retval = 0; break; } spin_unlock(&dum->lock); if (retval == 0) { dev_dbg(udc_dev(dum), "dequeued req %p from %s, len %d buf %p\n", req, _ep->name, _req->length, _req->buf); usb_gadget_giveback_request(_ep, _req); } local_irq_restore(flags); return retval; } static int dummy_set_halt_and_wedge(struct usb_ep *_ep, int value, int wedged) { struct dummy_ep *ep; struct dummy *dum; if (!_ep) return -EINVAL; ep = usb_ep_to_dummy_ep(_ep); dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; if (!value) ep->halted = ep->wedged = 0; else if (ep->desc && (ep->desc->bEndpointAddress & USB_DIR_IN) && !list_empty(&ep->queue)) return -EAGAIN; else { ep->halted = 1; if (wedged) ep->wedged = 1; } /* FIXME clear emulated data toggle too */ return 0; } static int dummy_set_halt(struct usb_ep *_ep, int value) { return dummy_set_halt_and_wedge(_ep, value, 0); } static int dummy_set_wedge(struct usb_ep *_ep) { if (!_ep || _ep->name == ep0name) return -EINVAL; return dummy_set_halt_and_wedge(_ep, 1, 1); } static const struct usb_ep_ops dummy_ep_ops = { .enable = dummy_enable, .disable = dummy_disable, .alloc_request = dummy_alloc_request, .free_request = dummy_free_request, .queue = dummy_queue, .dequeue = dummy_dequeue, .set_halt = dummy_set_halt, .set_wedge = dummy_set_wedge, }; /*-------------------------------------------------------------------------*/ /* there are both host and device side versions of this call ... */ static int dummy_g_get_frame(struct usb_gadget *_gadget) { struct timespec64 ts64; ktime_get_ts64(&ts64); return ts64.tv_nsec / NSEC_PER_MSEC; } static int dummy_wakeup(struct usb_gadget *_gadget) { struct dummy_hcd *dum_hcd; dum_hcd = gadget_to_dummy_hcd(_gadget); if (!(dum_hcd->dum->devstatus & ((1 << USB_DEVICE_B_HNP_ENABLE) | (1 << USB_DEVICE_REMOTE_WAKEUP)))) return -EINVAL; if ((dum_hcd->port_status & USB_PORT_STAT_CONNECTION) == 0) return -ENOLINK; if ((dum_hcd->port_status & USB_PORT_STAT_SUSPEND) == 0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) return -EIO; /* FIXME: What if the root hub is suspended but the port isn't? */ /* hub notices our request, issues downstream resume, etc */ dum_hcd->resuming = 1; dum_hcd->re_timeout = jiffies + msecs_to_jiffies(20); mod_timer(&dummy_hcd_to_hcd(dum_hcd)->rh_timer, dum_hcd->re_timeout); return 0; } static int dummy_set_selfpowered(struct usb_gadget *_gadget, int value) { struct dummy *dum; _gadget->is_selfpowered = (value != 0); dum = gadget_to_dummy_hcd(_gadget)->dum; if (value) dum->devstatus |= (1 << USB_DEVICE_SELF_POWERED); else dum->devstatus &= ~(1 << USB_DEVICE_SELF_POWERED); return 0; } static void dummy_udc_update_ep0(struct dummy *dum) { if (dum->gadget.speed == USB_SPEED_SUPER) dum->ep[0].ep.maxpacket = 9; else dum->ep[0].ep.maxpacket = 64; } static int dummy_pullup(struct usb_gadget *_gadget, int value) { struct dummy_hcd *dum_hcd; struct dummy *dum; unsigned long flags; dum = gadget_dev_to_dummy(&_gadget->dev); dum_hcd = gadget_to_dummy_hcd(_gadget); spin_lock_irqsave(&dum->lock, flags); dum->pullup = (value != 0); set_link_state(dum_hcd); if (value == 0) { /* * Emulate synchronize_irq(): wait for callbacks to finish. * This seems to be the best place to emulate the call to * synchronize_irq() that's in usb_gadget_remove_driver(). * Doing it in dummy_udc_stop() would be too late since it * is called after the unbind callback and unbind shouldn't * be invoked until all the other callbacks are finished. */ while (dum->callback_usage > 0) { spin_unlock_irqrestore(&dum->lock, flags); usleep_range(1000, 2000); spin_lock_irqsave(&dum->lock, flags); } } spin_unlock_irqrestore(&dum->lock, flags); usb_hcd_poll_rh_status(dummy_hcd_to_hcd(dum_hcd)); return 0; } static void dummy_udc_set_speed(struct usb_gadget *_gadget, enum usb_device_speed speed) { struct dummy *dum; dum = gadget_dev_to_dummy(&_gadget->dev); dum->gadget.speed = speed; dummy_udc_update_ep0(dum); } static void dummy_udc_async_callbacks(struct usb_gadget *_gadget, bool enable) { struct dummy *dum = gadget_dev_to_dummy(&_gadget->dev); spin_lock_irq(&dum->lock); dum->ints_enabled = enable; spin_unlock_irq(&dum->lock); } static int dummy_udc_start(struct usb_gadget *g, struct usb_gadget_driver *driver); static int dummy_udc_stop(struct usb_gadget *g); static const struct usb_gadget_ops dummy_ops = { .get_frame = dummy_g_get_frame, .wakeup = dummy_wakeup, .set_selfpowered = dummy_set_selfpowered, .pullup = dummy_pullup, .udc_start = dummy_udc_start, .udc_stop = dummy_udc_stop, .udc_set_speed = dummy_udc_set_speed, .udc_async_callbacks = dummy_udc_async_callbacks, }; /*-------------------------------------------------------------------------*/ /* "function" sysfs attribute */ static ssize_t function_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dummy *dum = gadget_dev_to_dummy(dev); if (!dum->driver || !dum->driver->function) return 0; return scnprintf(buf, PAGE_SIZE, "%s\n", dum->driver->function); } static DEVICE_ATTR_RO(function); /*-------------------------------------------------------------------------*/ /* * Driver registration/unregistration. * * This is basically hardware-specific; there's usually only one real USB * device (not host) controller since that's how USB devices are intended * to work. So most implementations of these api calls will rely on the * fact that only one driver will ever bind to the hardware. But curious * hardware can be built with discrete components, so the gadget API doesn't * require that assumption. * * For this emulator, it might be convenient to create a usb device * for each driver that registers: just add to a big root hub. */ static int dummy_udc_start(struct usb_gadget *g, struct usb_gadget_driver *driver) { struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(g); struct dummy *dum = dum_hcd->dum; switch (g->speed) { /* All the speeds we support */ case USB_SPEED_LOW: case USB_SPEED_FULL: case USB_SPEED_HIGH: case USB_SPEED_SUPER: break; default: dev_err(dummy_dev(dum_hcd), "Unsupported driver max speed %d\n", driver->max_speed); return -EINVAL; } /* * DEVICE side init ... the layer above hardware, which * can't enumerate without help from the driver we're binding. */ spin_lock_irq(&dum->lock); dum->devstatus = 0; dum->driver = driver; spin_unlock_irq(&dum->lock); return 0; } static int dummy_udc_stop(struct usb_gadget *g) { struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(g); struct dummy *dum = dum_hcd->dum; spin_lock_irq(&dum->lock); dum->ints_enabled = 0; stop_activity(dum); dum->driver = NULL; spin_unlock_irq(&dum->lock); return 0; } #undef is_enabled /* The gadget structure is stored inside the hcd structure and will be * released along with it. */ static void init_dummy_udc_hw(struct dummy *dum) { int i; INIT_LIST_HEAD(&dum->gadget.ep_list); for (i = 0; i < DUMMY_ENDPOINTS; i++) { struct dummy_ep *ep = &dum->ep[i]; if (!ep_info[i].name) break; ep->ep.name = ep_info[i].name; ep->ep.caps = ep_info[i].caps; ep->ep.ops = &dummy_ep_ops; list_add_tail(&ep->ep.ep_list, &dum->gadget.ep_list); ep->halted = ep->wedged = ep->already_seen = ep->setup_stage = 0; usb_ep_set_maxpacket_limit(&ep->ep, ~0); ep->ep.max_streams = 16; ep->last_io = jiffies; ep->gadget = &dum->gadget; ep->desc = NULL; INIT_LIST_HEAD(&ep->queue); } dum->gadget.ep0 = &dum->ep[0].ep; list_del_init(&dum->ep[0].ep.ep_list); INIT_LIST_HEAD(&dum->fifo_req.queue); #ifdef CONFIG_USB_OTG dum->gadget.is_otg = 1; #endif } static int dummy_udc_probe(struct platform_device *pdev) { struct dummy *dum; int rc; dum = *((void **)dev_get_platdata(&pdev->dev)); /* Clear usb_gadget region for new registration to udc-core */ memzero_explicit(&dum->gadget, sizeof(struct usb_gadget)); dum->gadget.name = gadget_name; dum->gadget.ops = &dummy_ops; if (mod_data.is_super_speed) dum->gadget.max_speed = USB_SPEED_SUPER; else if (mod_data.is_high_speed) dum->gadget.max_speed = USB_SPEED_HIGH; else dum->gadget.max_speed = USB_SPEED_FULL; dum->gadget.dev.parent = &pdev->dev; init_dummy_udc_hw(dum); rc = usb_add_gadget_udc(&pdev->dev, &dum->gadget); if (rc < 0) goto err_udc; rc = device_create_file(&dum->gadget.dev, &dev_attr_function); if (rc < 0) goto err_dev; platform_set_drvdata(pdev, dum); return rc; err_dev: usb_del_gadget_udc(&dum->gadget); err_udc: return rc; } static void dummy_udc_remove(struct platform_device *pdev) { struct dummy *dum = platform_get_drvdata(pdev); device_remove_file(&dum->gadget.dev, &dev_attr_function); usb_del_gadget_udc(&dum->gadget); } static void dummy_udc_pm(struct dummy *dum, struct dummy_hcd *dum_hcd, int suspend) { spin_lock_irq(&dum->lock); dum->udc_suspended = suspend; set_link_state(dum_hcd); spin_unlock_irq(&dum->lock); } static int dummy_udc_suspend(struct platform_device *pdev, pm_message_t state) { struct dummy *dum = platform_get_drvdata(pdev); struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(&dum->gadget); dev_dbg(&pdev->dev, "%s\n", __func__); dummy_udc_pm(dum, dum_hcd, 1); usb_hcd_poll_rh_status(dummy_hcd_to_hcd(dum_hcd)); return 0; } static int dummy_udc_resume(struct platform_device *pdev) { struct dummy *dum = platform_get_drvdata(pdev); struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(&dum->gadget); dev_dbg(&pdev->dev, "%s\n", __func__); dummy_udc_pm(dum, dum_hcd, 0); usb_hcd_poll_rh_status(dummy_hcd_to_hcd(dum_hcd)); return 0; } static struct platform_driver dummy_udc_driver = { .probe = dummy_udc_probe, .remove_new = dummy_udc_remove, .suspend = dummy_udc_suspend, .resume = dummy_udc_resume, .driver = { .name = gadget_name, }, }; /*-------------------------------------------------------------------------*/ static unsigned int dummy_get_ep_idx(const struct usb_endpoint_descriptor *desc) { unsigned int index; index = usb_endpoint_num(desc) << 1; if (usb_endpoint_dir_in(desc)) index |= 1; return index; } /* HOST SIDE DRIVER * * this uses the hcd framework to hook up to host side drivers. * its root hub will only have one device, otherwise it acts like * a normal host controller. * * when urbs are queued, they're just stuck on a list that we * scan in a timer callback. that callback connects writes from * the host with reads from the device, and so on, based on the * usb 2.0 rules. */ static int dummy_ep_stream_en(struct dummy_hcd *dum_hcd, struct urb *urb) { const struct usb_endpoint_descriptor *desc = &urb->ep->desc; u32 index; if (!usb_endpoint_xfer_bulk(desc)) return 0; index = dummy_get_ep_idx(desc); return (1 << index) & dum_hcd->stream_en_ep; } /* * The max stream number is saved as a nibble so for the 30 possible endpoints * we only 15 bytes of memory. Therefore we are limited to max 16 streams (0 * means we use only 1 stream). The maximum according to the spec is 16bit so * if the 16 stream limit is about to go, the array size should be incremented * to 30 elements of type u16. */ static int get_max_streams_for_pipe(struct dummy_hcd *dum_hcd, unsigned int pipe) { int max_streams; max_streams = dum_hcd->num_stream[usb_pipeendpoint(pipe)]; if (usb_pipeout(pipe)) max_streams >>= 4; else max_streams &= 0xf; max_streams++; return max_streams; } static void set_max_streams_for_pipe(struct dummy_hcd *dum_hcd, unsigned int pipe, unsigned int streams) { int max_streams; streams--; max_streams = dum_hcd->num_stream[usb_pipeendpoint(pipe)]; if (usb_pipeout(pipe)) { streams <<= 4; max_streams &= 0xf; } else { max_streams &= 0xf0; } max_streams |= streams; dum_hcd->num_stream[usb_pipeendpoint(pipe)] = max_streams; } static int dummy_validate_stream(struct dummy_hcd *dum_hcd, struct urb *urb) { unsigned int max_streams; int enabled; enabled = dummy_ep_stream_en(dum_hcd, urb); if (!urb->stream_id) { if (enabled) return -EINVAL; return 0; } if (!enabled) return -EINVAL; max_streams = get_max_streams_for_pipe(dum_hcd, usb_pipeendpoint(urb->pipe)); if (urb->stream_id > max_streams) { dev_err(dummy_dev(dum_hcd), "Stream id %d is out of range.\n", urb->stream_id); BUG(); return -EINVAL; } return 0; } static int dummy_urb_enqueue( struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags ) { struct dummy_hcd *dum_hcd; struct urbp *urbp; unsigned long flags; int rc; urbp = kmalloc(sizeof *urbp, mem_flags); if (!urbp) return -ENOMEM; urbp->urb = urb; urbp->miter_started = 0; dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); rc = dummy_validate_stream(dum_hcd, urb); if (rc) { kfree(urbp); goto done; } rc = usb_hcd_link_urb_to_ep(hcd, urb); if (rc) { kfree(urbp); goto done; } if (!dum_hcd->udev) { dum_hcd->udev = urb->dev; usb_get_dev(dum_hcd->udev); } else if (unlikely(dum_hcd->udev != urb->dev)) dev_err(dummy_dev(dum_hcd), "usb_device address has changed!\n"); list_add_tail(&urbp->urbp_list, &dum_hcd->urbp_list); urb->hcpriv = urbp; if (!dum_hcd->next_frame_urbp) dum_hcd->next_frame_urbp = urbp; if (usb_pipetype(urb->pipe) == PIPE_CONTROL) urb->error_count = 1; /* mark as a new urb */ /* kick the scheduler, it'll do the rest */ if (!timer_pending(&dum_hcd->timer)) mod_timer(&dum_hcd->timer, jiffies + 1); done: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return rc; } static int dummy_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { struct dummy_hcd *dum_hcd; unsigned long flags; int rc; /* giveback happens automatically in timer callback, * so make sure the callback happens */ dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); rc = usb_hcd_check_unlink_urb(hcd, urb, status); if (!rc && dum_hcd->rh_state != DUMMY_RH_RUNNING && !list_empty(&dum_hcd->urbp_list)) mod_timer(&dum_hcd->timer, jiffies); spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return rc; } static int dummy_perform_transfer(struct urb *urb, struct dummy_request *req, u32 len) { void *ubuf, *rbuf; struct urbp *urbp = urb->hcpriv; int to_host; struct sg_mapping_iter *miter = &urbp->miter; u32 trans = 0; u32 this_sg; bool next_sg; to_host = usb_urb_dir_in(urb); rbuf = req->req.buf + req->req.actual; if (!urb->num_sgs) { ubuf = urb->transfer_buffer + urb->actual_length; if (to_host) memcpy(ubuf, rbuf, len); else memcpy(rbuf, ubuf, len); return len; } if (!urbp->miter_started) { u32 flags = SG_MITER_ATOMIC; if (to_host) flags |= SG_MITER_TO_SG; else flags |= SG_MITER_FROM_SG; sg_miter_start(miter, urb->sg, urb->num_sgs, flags); urbp->miter_started = 1; } next_sg = sg_miter_next(miter); if (next_sg == false) { WARN_ON_ONCE(1); return -EINVAL; } do { ubuf = miter->addr; this_sg = min_t(u32, len, miter->length); miter->consumed = this_sg; trans += this_sg; if (to_host) memcpy(ubuf, rbuf, this_sg); else memcpy(rbuf, ubuf, this_sg); len -= this_sg; if (!len) break; next_sg = sg_miter_next(miter); if (next_sg == false) { WARN_ON_ONCE(1); return -EINVAL; } rbuf += this_sg; } while (1); sg_miter_stop(miter); return trans; } /* transfer up to a frame's worth; caller must own lock */ static int transfer(struct dummy_hcd *dum_hcd, struct urb *urb, struct dummy_ep *ep, int limit, int *status) { struct dummy *dum = dum_hcd->dum; struct dummy_request *req; int sent = 0; top: /* if there's no request queued, the device is NAKing; return */ list_for_each_entry(req, &ep->queue, queue) { unsigned host_len, dev_len, len; int is_short, to_host; int rescan = 0; if (dummy_ep_stream_en(dum_hcd, urb)) { if ((urb->stream_id != req->req.stream_id)) continue; } /* 1..N packets of ep->ep.maxpacket each ... the last one * may be short (including zero length). * * writer can send a zlp explicitly (length 0) or implicitly * (length mod maxpacket zero, and 'zero' flag); they always * terminate reads. */ host_len = urb->transfer_buffer_length - urb->actual_length; dev_len = req->req.length - req->req.actual; len = min(host_len, dev_len); /* FIXME update emulated data toggle too */ to_host = usb_urb_dir_in(urb); if (unlikely(len == 0)) is_short = 1; else { /* not enough bandwidth left? */ if (limit < ep->ep.maxpacket && limit < len) break; len = min_t(unsigned, len, limit); if (len == 0) break; /* send multiple of maxpacket first, then remainder */ if (len >= ep->ep.maxpacket) { is_short = 0; if (len % ep->ep.maxpacket) rescan = 1; len -= len % ep->ep.maxpacket; } else { is_short = 1; } len = dummy_perform_transfer(urb, req, len); ep->last_io = jiffies; if ((int)len < 0) { req->req.status = len; } else { limit -= len; sent += len; urb->actual_length += len; req->req.actual += len; } } /* short packets terminate, maybe with overflow/underflow. * it's only really an error to write too much. * * partially filling a buffer optionally blocks queue advances * (so completion handlers can clean up the queue) but we don't * need to emulate such data-in-flight. */ if (is_short) { if (host_len == dev_len) { req->req.status = 0; *status = 0; } else if (to_host) { req->req.status = 0; if (dev_len > host_len) *status = -EOVERFLOW; else *status = 0; } else { *status = 0; if (host_len > dev_len) req->req.status = -EOVERFLOW; else req->req.status = 0; } /* * many requests terminate without a short packet. * send a zlp if demanded by flags. */ } else { if (req->req.length == req->req.actual) { if (req->req.zero && to_host) rescan = 1; else req->req.status = 0; } if (urb->transfer_buffer_length == urb->actual_length) { if (urb->transfer_flags & URB_ZERO_PACKET && !to_host) rescan = 1; else *status = 0; } } /* device side completion --> continuable */ if (req->req.status != -EINPROGRESS) { list_del_init(&req->queue); spin_unlock(&dum->lock); usb_gadget_giveback_request(&ep->ep, &req->req); spin_lock(&dum->lock); /* requests might have been unlinked... */ rescan = 1; } /* host side completion --> terminate */ if (*status != -EINPROGRESS) break; /* rescan to continue with any other queued i/o */ if (rescan) goto top; } return sent; } static int periodic_bytes(struct dummy *dum, struct dummy_ep *ep) { int limit = ep->ep.maxpacket; if (dum->gadget.speed == USB_SPEED_HIGH) { int tmp; /* high bandwidth mode */ tmp = usb_endpoint_maxp_mult(ep->desc); tmp *= 8 /* applies to entire frame */; limit += limit * tmp; } if (dum->gadget.speed == USB_SPEED_SUPER) { switch (usb_endpoint_type(ep->desc)) { case USB_ENDPOINT_XFER_ISOC: /* Sec. 4.4.8.2 USB3.0 Spec */ limit = 3 * 16 * 1024 * 8; break; case USB_ENDPOINT_XFER_INT: /* Sec. 4.4.7.2 USB3.0 Spec */ limit = 3 * 1024 * 8; break; case USB_ENDPOINT_XFER_BULK: default: break; } } return limit; } #define is_active(dum_hcd) ((dum_hcd->port_status & \ (USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE | \ USB_PORT_STAT_SUSPEND)) \ == (USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE)) static struct dummy_ep *find_endpoint(struct dummy *dum, u8 address) { int i; if (!is_active((dum->gadget.speed == USB_SPEED_SUPER ? dum->ss_hcd : dum->hs_hcd))) return NULL; if (!dum->ints_enabled) return NULL; if ((address & ~USB_DIR_IN) == 0) return &dum->ep[0]; for (i = 1; i < DUMMY_ENDPOINTS; i++) { struct dummy_ep *ep = &dum->ep[i]; if (!ep->desc) continue; if (ep->desc->bEndpointAddress == address) return ep; } return NULL; } #undef is_active #define Dev_Request (USB_TYPE_STANDARD | USB_RECIP_DEVICE) #define Dev_InRequest (Dev_Request | USB_DIR_IN) #define Intf_Request (USB_TYPE_STANDARD | USB_RECIP_INTERFACE) #define Intf_InRequest (Intf_Request | USB_DIR_IN) #define Ep_Request (USB_TYPE_STANDARD | USB_RECIP_ENDPOINT) #define Ep_InRequest (Ep_Request | USB_DIR_IN) /** * handle_control_request() - handles all control transfers * @dum_hcd: pointer to dummy (the_controller) * @urb: the urb request to handle * @setup: pointer to the setup data for a USB device control * request * @status: pointer to request handling status * * Return 0 - if the request was handled * 1 - if the request wasn't handles * error code on error */ static int handle_control_request(struct dummy_hcd *dum_hcd, struct urb *urb, struct usb_ctrlrequest *setup, int *status) { struct dummy_ep *ep2; struct dummy *dum = dum_hcd->dum; int ret_val = 1; unsigned w_index; unsigned w_value; w_index = le16_to_cpu(setup->wIndex); w_value = le16_to_cpu(setup->wValue); switch (setup->bRequest) { case USB_REQ_SET_ADDRESS: if (setup->bRequestType != Dev_Request) break; dum->address = w_value; *status = 0; dev_dbg(udc_dev(dum), "set_address = %d\n", w_value); ret_val = 0; break; case USB_REQ_SET_FEATURE: if (setup->bRequestType == Dev_Request) { ret_val = 0; switch (w_value) { case USB_DEVICE_REMOTE_WAKEUP: break; case USB_DEVICE_B_HNP_ENABLE: dum->gadget.b_hnp_enable = 1; break; case USB_DEVICE_A_HNP_SUPPORT: dum->gadget.a_hnp_support = 1; break; case USB_DEVICE_A_ALT_HNP_SUPPORT: dum->gadget.a_alt_hnp_support = 1; break; case USB_DEVICE_U1_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U1_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_U2_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U2_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_LTM_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_LTM_ENABLED; else ret_val = -EOPNOTSUPP; break; default: ret_val = -EOPNOTSUPP; } if (ret_val == 0) { dum->devstatus |= (1 << w_value); *status = 0; } } else if (setup->bRequestType == Ep_Request) { /* endpoint halt */ ep2 = find_endpoint(dum, w_index); if (!ep2 || ep2->ep.name == ep0name) { ret_val = -EOPNOTSUPP; break; } ep2->halted = 1; ret_val = 0; *status = 0; } break; case USB_REQ_CLEAR_FEATURE: if (setup->bRequestType == Dev_Request) { ret_val = 0; switch (w_value) { case USB_DEVICE_REMOTE_WAKEUP: w_value = USB_DEVICE_REMOTE_WAKEUP; break; case USB_DEVICE_U1_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U1_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_U2_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U2_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_LTM_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_LTM_ENABLED; else ret_val = -EOPNOTSUPP; break; default: ret_val = -EOPNOTSUPP; break; } if (ret_val == 0) { dum->devstatus &= ~(1 << w_value); *status = 0; } } else if (setup->bRequestType == Ep_Request) { /* endpoint halt */ ep2 = find_endpoint(dum, w_index); if (!ep2) { ret_val = -EOPNOTSUPP; break; } if (!ep2->wedged) ep2->halted = 0; ret_val = 0; *status = 0; } break; case USB_REQ_GET_STATUS: if (setup->bRequestType == Dev_InRequest || setup->bRequestType == Intf_InRequest || setup->bRequestType == Ep_InRequest) { char *buf; /* * device: remote wakeup, selfpowered * interface: nothing * endpoint: halt */ buf = (char *)urb->transfer_buffer; if (urb->transfer_buffer_length > 0) { if (setup->bRequestType == Ep_InRequest) { ep2 = find_endpoint(dum, w_index); if (!ep2) { ret_val = -EOPNOTSUPP; break; } buf[0] = ep2->halted; } else if (setup->bRequestType == Dev_InRequest) { buf[0] = (u8)dum->devstatus; } else buf[0] = 0; } if (urb->transfer_buffer_length > 1) buf[1] = 0; urb->actual_length = min_t(u32, 2, urb->transfer_buffer_length); ret_val = 0; *status = 0; } break; } return ret_val; } /* * Drive both sides of the transfers; looks like irq handlers to both * drivers except that the callbacks are invoked from soft interrupt * context. */ static void dummy_timer(struct timer_list *t) { struct dummy_hcd *dum_hcd = from_timer(dum_hcd, t, timer); struct dummy *dum = dum_hcd->dum; struct urbp *urbp, *tmp; unsigned long flags; int limit, total; int i; /* simplistic model for one frame's bandwidth */ /* FIXME: account for transaction and packet overhead */ switch (dum->gadget.speed) { case USB_SPEED_LOW: total = 8/*bytes*/ * 12/*packets*/; break; case USB_SPEED_FULL: total = 64/*bytes*/ * 19/*packets*/; break; case USB_SPEED_HIGH: total = 512/*bytes*/ * 13/*packets*/ * 8/*uframes*/; break; case USB_SPEED_SUPER: /* Bus speed is 500000 bytes/ms, so use a little less */ total = 490000; break; default: /* Can't happen */ dev_err(dummy_dev(dum_hcd), "bogus device speed\n"); total = 0; break; } /* FIXME if HZ != 1000 this will probably misbehave ... */ /* look at each urb queued by the host side driver */ spin_lock_irqsave(&dum->lock, flags); if (!dum_hcd->udev) { dev_err(dummy_dev(dum_hcd), "timer fired with no URBs pending?\n"); spin_unlock_irqrestore(&dum->lock, flags); return; } dum_hcd->next_frame_urbp = NULL; for (i = 0; i < DUMMY_ENDPOINTS; i++) { if (!ep_info[i].name) break; dum->ep[i].already_seen = 0; } restart: list_for_each_entry_safe(urbp, tmp, &dum_hcd->urbp_list, urbp_list) { struct urb *urb; struct dummy_request *req; u8 address; struct dummy_ep *ep = NULL; int status = -EINPROGRESS; /* stop when we reach URBs queued after the timer interrupt */ if (urbp == dum_hcd->next_frame_urbp) break; urb = urbp->urb; if (urb->unlinked) goto return_urb; else if (dum_hcd->rh_state != DUMMY_RH_RUNNING) continue; /* Used up this frame's bandwidth? */ if (total <= 0) continue; /* find the gadget's ep for this request (if configured) */ address = usb_pipeendpoint (urb->pipe); if (usb_urb_dir_in(urb)) address |= USB_DIR_IN; ep = find_endpoint(dum, address); if (!ep) { /* set_configuration() disagreement */ dev_dbg(dummy_dev(dum_hcd), "no ep configured for urb %p\n", urb); status = -EPROTO; goto return_urb; } if (ep->already_seen) continue; ep->already_seen = 1; if (ep == &dum->ep[0] && urb->error_count) { ep->setup_stage = 1; /* a new urb */ urb->error_count = 0; } if (ep->halted && !ep->setup_stage) { /* NOTE: must not be iso! */ dev_dbg(dummy_dev(dum_hcd), "ep %s halted, urb %p\n", ep->ep.name, urb); status = -EPIPE; goto return_urb; } /* FIXME make sure both ends agree on maxpacket */ /* handle control requests */ if (ep == &dum->ep[0] && ep->setup_stage) { struct usb_ctrlrequest setup; int value; setup = *(struct usb_ctrlrequest *) urb->setup_packet; /* paranoia, in case of stale queued data */ list_for_each_entry(req, &ep->queue, queue) { list_del_init(&req->queue); req->req.status = -EOVERFLOW; dev_dbg(udc_dev(dum), "stale req = %p\n", req); spin_unlock(&dum->lock); usb_gadget_giveback_request(&ep->ep, &req->req); spin_lock(&dum->lock); ep->already_seen = 0; goto restart; } /* gadget driver never sees set_address or operations * on standard feature flags. some hardware doesn't * even expose them. */ ep->last_io = jiffies; ep->setup_stage = 0; ep->halted = 0; value = handle_control_request(dum_hcd, urb, &setup, &status); /* gadget driver handles all other requests. block * until setup() returns; no reentrancy issues etc. */ if (value > 0) { ++dum->callback_usage; spin_unlock(&dum->lock); value = dum->driver->setup(&dum->gadget, &setup); spin_lock(&dum->lock); --dum->callback_usage; if (value >= 0) { /* no delays (max 64KB data stage) */ limit = 64*1024; goto treat_control_like_bulk; } /* error, see below */ } if (value < 0) { if (value != -EOPNOTSUPP) dev_dbg(udc_dev(dum), "setup --> %d\n", value); status = -EPIPE; urb->actual_length = 0; } goto return_urb; } /* non-control requests */ limit = total; switch (usb_pipetype(urb->pipe)) { case PIPE_ISOCHRONOUS: /* * We don't support isochronous. But if we did, * here are some of the issues we'd have to face: * * Is it urb->interval since the last xfer? * Use urb->iso_frame_desc[i]. * Complete whether or not ep has requests queued. * Report random errors, to debug drivers. */ limit = max(limit, periodic_bytes(dum, ep)); status = -EINVAL; /* fail all xfers */ break; case PIPE_INTERRUPT: /* FIXME is it urb->interval since the last xfer? * this almost certainly polls too fast. */ limit = max(limit, periodic_bytes(dum, ep)); fallthrough; default: treat_control_like_bulk: ep->last_io = jiffies; total -= transfer(dum_hcd, urb, ep, limit, &status); break; } /* incomplete transfer? */ if (status == -EINPROGRESS) continue; return_urb: list_del(&urbp->urbp_list); kfree(urbp); if (ep) ep->already_seen = ep->setup_stage = 0; usb_hcd_unlink_urb_from_ep(dummy_hcd_to_hcd(dum_hcd), urb); spin_unlock(&dum->lock); usb_hcd_giveback_urb(dummy_hcd_to_hcd(dum_hcd), urb, status); spin_lock(&dum->lock); goto restart; } if (list_empty(&dum_hcd->urbp_list)) { usb_put_dev(dum_hcd->udev); dum_hcd->udev = NULL; } else if (dum_hcd->rh_state == DUMMY_RH_RUNNING) { /* want a 1 msec delay here */ mod_timer(&dum_hcd->timer, jiffies + msecs_to_jiffies(1)); } spin_unlock_irqrestore(&dum->lock, flags); } /*-------------------------------------------------------------------------*/ #define PORT_C_MASK \ ((USB_PORT_STAT_C_CONNECTION \ | USB_PORT_STAT_C_ENABLE \ | USB_PORT_STAT_C_SUSPEND \ | USB_PORT_STAT_C_OVERCURRENT \ | USB_PORT_STAT_C_RESET) << 16) static int dummy_hub_status(struct usb_hcd *hcd, char *buf) { struct dummy_hcd *dum_hcd; unsigned long flags; int retval = 0; dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) goto done; if (dum_hcd->resuming && time_after_eq(jiffies, dum_hcd->re_timeout)) { dum_hcd->port_status |= (USB_PORT_STAT_C_SUSPEND << 16); dum_hcd->port_status &= ~USB_PORT_STAT_SUSPEND; set_link_state(dum_hcd); } if ((dum_hcd->port_status & PORT_C_MASK) != 0) { *buf = (1 << 1); dev_dbg(dummy_dev(dum_hcd), "port status 0x%08x has changes\n", dum_hcd->port_status); retval = 1; if (dum_hcd->rh_state == DUMMY_RH_SUSPENDED) usb_hcd_resume_root_hub(hcd); } done: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return retval; } /* usb 3.0 root hub device descriptor */ static struct { struct usb_bos_descriptor bos; struct usb_ss_cap_descriptor ss_cap; } __packed usb3_bos_desc = { .bos = { .bLength = USB_DT_BOS_SIZE, .bDescriptorType = USB_DT_BOS, .wTotalLength = cpu_to_le16(sizeof(usb3_bos_desc)), .bNumDeviceCaps = 1, }, .ss_cap = { .bLength = USB_DT_USB_SS_CAP_SIZE, .bDescriptorType = USB_DT_DEVICE_CAPABILITY, .bDevCapabilityType = USB_SS_CAP_TYPE, .wSpeedSupported = cpu_to_le16(USB_5GBPS_OPERATION), .bFunctionalitySupport = ilog2(USB_5GBPS_OPERATION), }, }; static inline void ss_hub_descriptor(struct usb_hub_descriptor *desc) { memset(desc, 0, sizeof *desc); desc->bDescriptorType = USB_DT_SS_HUB; desc->bDescLength = 12; desc->wHubCharacteristics = cpu_to_le16( HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_COMMON_OCPM); desc->bNbrPorts = 1; desc->u.ss.bHubHdrDecLat = 0x04; /* Worst case: 0.4 micro sec*/ desc->u.ss.DeviceRemovable = 0; } static inline void hub_descriptor(struct usb_hub_descriptor *desc) { memset(desc, 0, sizeof *desc); desc->bDescriptorType = USB_DT_HUB; desc->bDescLength = 9; desc->wHubCharacteristics = cpu_to_le16( HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_COMMON_OCPM); desc->bNbrPorts = 1; desc->u.hs.DeviceRemovable[0] = 0; desc->u.hs.DeviceRemovable[1] = 0xff; /* PortPwrCtrlMask */ } static int dummy_hub_control( struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength ) { struct dummy_hcd *dum_hcd; int retval = 0; unsigned long flags; if (!HCD_HW_ACCESSIBLE(hcd)) return -ETIMEDOUT; dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); switch (typeReq) { case ClearHubFeature: break; case ClearPortFeature: switch (wValue) { case USB_PORT_FEAT_SUSPEND: if (hcd->speed == HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_SUSPEND req not " "supported for USB 3.0 roothub\n"); goto error; } if (dum_hcd->port_status & USB_PORT_STAT_SUSPEND) { /* 20msec resume signaling */ dum_hcd->resuming = 1; dum_hcd->re_timeout = jiffies + msecs_to_jiffies(20); } break; case USB_PORT_FEAT_POWER: dev_dbg(dummy_dev(dum_hcd), "power-off\n"); if (hcd->speed == HCD_USB3) dum_hcd->port_status &= ~USB_SS_PORT_STAT_POWER; else dum_hcd->port_status &= ~USB_PORT_STAT_POWER; set_link_state(dum_hcd); break; case USB_PORT_FEAT_ENABLE: case USB_PORT_FEAT_C_ENABLE: case USB_PORT_FEAT_C_SUSPEND: /* Not allowed for USB-3 */ if (hcd->speed == HCD_USB3) goto error; fallthrough; case USB_PORT_FEAT_C_CONNECTION: case USB_PORT_FEAT_C_RESET: dum_hcd->port_status &= ~(1 << wValue); set_link_state(dum_hcd); break; default: /* Disallow INDICATOR and C_OVER_CURRENT */ goto error; } break; case GetHubDescriptor: if (hcd->speed == HCD_USB3 && (wLength < USB_DT_SS_HUB_SIZE || wValue != (USB_DT_SS_HUB << 8))) { dev_dbg(dummy_dev(dum_hcd), "Wrong hub descriptor type for " "USB 3.0 roothub.\n"); goto error; } if (hcd->speed == HCD_USB3) ss_hub_descriptor((struct usb_hub_descriptor *) buf); else hub_descriptor((struct usb_hub_descriptor *) buf); break; case DeviceRequest | USB_REQ_GET_DESCRIPTOR: if (hcd->speed != HCD_USB3) goto error; if ((wValue >> 8) != USB_DT_BOS) goto error; memcpy(buf, &usb3_bos_desc, sizeof(usb3_bos_desc)); retval = sizeof(usb3_bos_desc); break; case GetHubStatus: *(__le32 *) buf = cpu_to_le32(0); break; case GetPortStatus: if (wIndex != 1) retval = -EPIPE; /* whoever resets or resumes must GetPortStatus to * complete it!! */ if (dum_hcd->resuming && time_after_eq(jiffies, dum_hcd->re_timeout)) { dum_hcd->port_status |= (USB_PORT_STAT_C_SUSPEND << 16); dum_hcd->port_status &= ~USB_PORT_STAT_SUSPEND; } if ((dum_hcd->port_status & USB_PORT_STAT_RESET) != 0 && time_after_eq(jiffies, dum_hcd->re_timeout)) { dum_hcd->port_status |= (USB_PORT_STAT_C_RESET << 16); dum_hcd->port_status &= ~USB_PORT_STAT_RESET; if (dum_hcd->dum->pullup) { dum_hcd->port_status |= USB_PORT_STAT_ENABLE; if (hcd->speed < HCD_USB3) { switch (dum_hcd->dum->gadget.speed) { case USB_SPEED_HIGH: dum_hcd->port_status |= USB_PORT_STAT_HIGH_SPEED; break; case USB_SPEED_LOW: dum_hcd->dum->gadget.ep0-> maxpacket = 8; dum_hcd->port_status |= USB_PORT_STAT_LOW_SPEED; break; default: break; } } } } set_link_state(dum_hcd); ((__le16 *) buf)[0] = cpu_to_le16(dum_hcd->port_status); ((__le16 *) buf)[1] = cpu_to_le16(dum_hcd->port_status >> 16); break; case SetHubFeature: retval = -EPIPE; break; case SetPortFeature: switch (wValue) { case USB_PORT_FEAT_LINK_STATE: if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_LINK_STATE req not " "supported for USB 2.0 roothub\n"); goto error; } /* * Since this is dummy we don't have an actual link so * there is nothing to do for the SET_LINK_STATE cmd */ break; case USB_PORT_FEAT_U1_TIMEOUT: case USB_PORT_FEAT_U2_TIMEOUT: /* TODO: add suspend/resume support! */ if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_U1/2_TIMEOUT req not " "supported for USB 2.0 roothub\n"); goto error; } break; case USB_PORT_FEAT_SUSPEND: /* Applicable only for USB2.0 hub */ if (hcd->speed == HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_SUSPEND req not " "supported for USB 3.0 roothub\n"); goto error; } if (dum_hcd->active) { dum_hcd->port_status |= USB_PORT_STAT_SUSPEND; /* HNP would happen here; for now we * assume b_bus_req is always true. */ set_link_state(dum_hcd); if (((1 << USB_DEVICE_B_HNP_ENABLE) & dum_hcd->dum->devstatus) != 0) dev_dbg(dummy_dev(dum_hcd), "no HNP yet!\n"); } break; case USB_PORT_FEAT_POWER: if (hcd->speed == HCD_USB3) dum_hcd->port_status |= USB_SS_PORT_STAT_POWER; else dum_hcd->port_status |= USB_PORT_STAT_POWER; set_link_state(dum_hcd); break; case USB_PORT_FEAT_BH_PORT_RESET: /* Applicable only for USB3.0 hub */ if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_BH_PORT_RESET req not " "supported for USB 2.0 roothub\n"); goto error; } fallthrough; case USB_PORT_FEAT_RESET: if (!(dum_hcd->port_status & USB_PORT_STAT_CONNECTION)) break; /* if it's already enabled, disable */ if (hcd->speed == HCD_USB3) { dum_hcd->port_status = (USB_SS_PORT_STAT_POWER | USB_PORT_STAT_CONNECTION | USB_PORT_STAT_RESET); } else { dum_hcd->port_status &= ~(USB_PORT_STAT_ENABLE | USB_PORT_STAT_LOW_SPEED | USB_PORT_STAT_HIGH_SPEED); dum_hcd->port_status |= USB_PORT_STAT_RESET; } /* * We want to reset device status. All but the * Self powered feature */ dum_hcd->dum->devstatus &= (1 << USB_DEVICE_SELF_POWERED); /* * FIXME USB3.0: what is the correct reset signaling * interval? Is it still 50msec as for HS? */ dum_hcd->re_timeout = jiffies + msecs_to_jiffies(50); set_link_state(dum_hcd); break; case USB_PORT_FEAT_C_CONNECTION: case USB_PORT_FEAT_C_RESET: case USB_PORT_FEAT_C_ENABLE: case USB_PORT_FEAT_C_SUSPEND: /* Not allowed for USB-3, and ignored for USB-2 */ if (hcd->speed == HCD_USB3) goto error; break; default: /* Disallow TEST, INDICATOR, and C_OVER_CURRENT */ goto error; } break; case GetPortErrorCount: if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "GetPortErrorCount req not " "supported for USB 2.0 roothub\n"); goto error; } /* We'll always return 0 since this is a dummy hub */ *(__le32 *) buf = cpu_to_le32(0); break; case SetHubDepth: if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "SetHubDepth req not supported for " "USB 2.0 roothub\n"); goto error; } break; default: dev_dbg(dummy_dev(dum_hcd), "hub control req%04x v%04x i%04x l%d\n", typeReq, wValue, wIndex, wLength); error: /* "protocol stall" on error */ retval = -EPIPE; } spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); if ((dum_hcd->port_status & PORT_C_MASK) != 0) usb_hcd_poll_rh_status(hcd); return retval; } static int dummy_bus_suspend(struct usb_hcd *hcd) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); spin_lock_irq(&dum_hcd->dum->lock); dum_hcd->rh_state = DUMMY_RH_SUSPENDED; set_link_state(dum_hcd); hcd->state = HC_STATE_SUSPENDED; spin_unlock_irq(&dum_hcd->dum->lock); return 0; } static int dummy_bus_resume(struct usb_hcd *hcd) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); int rc = 0; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); spin_lock_irq(&dum_hcd->dum->lock); if (!HCD_HW_ACCESSIBLE(hcd)) { rc = -ESHUTDOWN; } else { dum_hcd->rh_state = DUMMY_RH_RUNNING; set_link_state(dum_hcd); if (!list_empty(&dum_hcd->urbp_list)) mod_timer(&dum_hcd->timer, jiffies); hcd->state = HC_STATE_RUNNING; } spin_unlock_irq(&dum_hcd->dum->lock); return rc; } /*-------------------------------------------------------------------------*/ static inline ssize_t show_urb(char *buf, size_t size, struct urb *urb) { int ep = usb_pipeendpoint(urb->pipe); return scnprintf(buf, size, "urb/%p %s ep%d%s%s len %d/%d\n", urb, ({ char *s; switch (urb->dev->speed) { case USB_SPEED_LOW: s = "ls"; break; case USB_SPEED_FULL: s = "fs"; break; case USB_SPEED_HIGH: s = "hs"; break; case USB_SPEED_SUPER: s = "ss"; break; default: s = "?"; break; } s; }), ep, ep ? (usb_urb_dir_in(urb) ? "in" : "out") : "", ({ char *s; \ switch (usb_pipetype(urb->pipe)) { \ case PIPE_CONTROL: \ s = ""; \ break; \ case PIPE_BULK: \ s = "-bulk"; \ break; \ case PIPE_INTERRUPT: \ s = "-int"; \ break; \ default: \ s = "-iso"; \ break; \ } s; }), urb->actual_length, urb->transfer_buffer_length); } static ssize_t urbs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_hcd *hcd = dev_get_drvdata(dev); struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); struct urbp *urbp; size_t size = 0; unsigned long flags; spin_lock_irqsave(&dum_hcd->dum->lock, flags); list_for_each_entry(urbp, &dum_hcd->urbp_list, urbp_list) { size_t temp; temp = show_urb(buf, PAGE_SIZE - size, urbp->urb); buf += temp; size += temp; } spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return size; } static DEVICE_ATTR_RO(urbs); static int dummy_start_ss(struct dummy_hcd *dum_hcd) { timer_setup(&dum_hcd->timer, dummy_timer, 0); dum_hcd->rh_state = DUMMY_RH_RUNNING; dum_hcd->stream_en_ep = 0; INIT_LIST_HEAD(&dum_hcd->urbp_list); dummy_hcd_to_hcd(dum_hcd)->power_budget = POWER_BUDGET_3; dummy_hcd_to_hcd(dum_hcd)->state = HC_STATE_RUNNING; dummy_hcd_to_hcd(dum_hcd)->uses_new_polling = 1; #ifdef CONFIG_USB_OTG dummy_hcd_to_hcd(dum_hcd)->self.otg_port = 1; #endif return 0; /* FIXME 'urbs' should be a per-device thing, maybe in usbcore */ return device_create_file(dummy_dev(dum_hcd), &dev_attr_urbs); } static int dummy_start(struct usb_hcd *hcd) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); /* * HOST side init ... we emulate a root hub that'll only ever * talk to one device (the gadget side). Also appears in sysfs, * just like more familiar pci-based HCDs. */ if (!usb_hcd_is_primary_hcd(hcd)) return dummy_start_ss(dum_hcd); spin_lock_init(&dum_hcd->dum->lock); timer_setup(&dum_hcd->timer, dummy_timer, 0); dum_hcd->rh_state = DUMMY_RH_RUNNING; INIT_LIST_HEAD(&dum_hcd->urbp_list); hcd->power_budget = POWER_BUDGET; hcd->state = HC_STATE_RUNNING; hcd->uses_new_polling = 1; #ifdef CONFIG_USB_OTG hcd->self.otg_port = 1; #endif /* FIXME 'urbs' should be a per-device thing, maybe in usbcore */ return device_create_file(dummy_dev(dum_hcd), &dev_attr_urbs); } static void dummy_stop(struct usb_hcd *hcd) { device_remove_file(dummy_dev(hcd_to_dummy_hcd(hcd)), &dev_attr_urbs); dev_info(dummy_dev(hcd_to_dummy_hcd(hcd)), "stopped\n"); } /*-------------------------------------------------------------------------*/ static int dummy_h_get_frame(struct usb_hcd *hcd) { return dummy_g_get_frame(NULL); } static int dummy_setup(struct usb_hcd *hcd) { struct dummy *dum; dum = *((void **)dev_get_platdata(hcd->self.controller)); hcd->self.sg_tablesize = ~0; if (usb_hcd_is_primary_hcd(hcd)) { dum->hs_hcd = hcd_to_dummy_hcd(hcd); dum->hs_hcd->dum = dum; /* * Mark the first roothub as being USB 2.0. * The USB 3.0 roothub will be registered later by * dummy_hcd_probe() */ hcd->speed = HCD_USB2; hcd->self.root_hub->speed = USB_SPEED_HIGH; } else { dum->ss_hcd = hcd_to_dummy_hcd(hcd); dum->ss_hcd->dum = dum; hcd->speed = HCD_USB3; hcd->self.root_hub->speed = USB_SPEED_SUPER; } return 0; } /* Change a group of bulk endpoints to support multiple stream IDs */ static int dummy_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, unsigned int num_streams, gfp_t mem_flags) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); unsigned long flags; int max_stream; int ret_streams = num_streams; unsigned int index; unsigned int i; if (!num_eps) return -EINVAL; spin_lock_irqsave(&dum_hcd->dum->lock, flags); for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); if ((1 << index) & dum_hcd->stream_en_ep) { ret_streams = -EINVAL; goto out; } max_stream = usb_ss_max_streams(&eps[i]->ss_ep_comp); if (!max_stream) { ret_streams = -EINVAL; goto out; } if (max_stream < ret_streams) { dev_dbg(dummy_dev(dum_hcd), "Ep 0x%x only supports %u " "stream IDs.\n", eps[i]->desc.bEndpointAddress, max_stream); ret_streams = max_stream; } } for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); dum_hcd->stream_en_ep |= 1 << index; set_max_streams_for_pipe(dum_hcd, usb_endpoint_num(&eps[i]->desc), ret_streams); } out: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return ret_streams; } /* Reverts a group of bulk endpoints back to not using stream IDs. */ static int dummy_free_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, gfp_t mem_flags) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); unsigned long flags; int ret; unsigned int index; unsigned int i; spin_lock_irqsave(&dum_hcd->dum->lock, flags); for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); if (!((1 << index) & dum_hcd->stream_en_ep)) { ret = -EINVAL; goto out; } } for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); dum_hcd->stream_en_ep &= ~(1 << index); set_max_streams_for_pipe(dum_hcd, usb_endpoint_num(&eps[i]->desc), 0); } ret = 0; out: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return ret; } static struct hc_driver dummy_hcd = { .description = (char *) driver_name, .product_desc = "Dummy host controller", .hcd_priv_size = sizeof(struct dummy_hcd), .reset = dummy_setup, .start = dummy_start, .stop = dummy_stop, .urb_enqueue = dummy_urb_enqueue, .urb_dequeue = dummy_urb_dequeue, .get_frame_number = dummy_h_get_frame, .hub_status_data = dummy_hub_status, .hub_control = dummy_hub_control, .bus_suspend = dummy_bus_suspend, .bus_resume = dummy_bus_resume, .alloc_streams = dummy_alloc_streams, .free_streams = dummy_free_streams, }; static int dummy_hcd_probe(struct platform_device *pdev) { struct dummy *dum; struct usb_hcd *hs_hcd; struct usb_hcd *ss_hcd; int retval; dev_info(&pdev->dev, "%s, driver " DRIVER_VERSION "\n", driver_desc); dum = *((void **)dev_get_platdata(&pdev->dev)); if (mod_data.is_super_speed) dummy_hcd.flags = HCD_USB3 | HCD_SHARED; else if (mod_data.is_high_speed) dummy_hcd.flags = HCD_USB2; else dummy_hcd.flags = HCD_USB11; hs_hcd = usb_create_hcd(&dummy_hcd, &pdev->dev, dev_name(&pdev->dev)); if (!hs_hcd) return -ENOMEM; hs_hcd->has_tt = 1; retval = usb_add_hcd(hs_hcd, 0, 0); if (retval) goto put_usb2_hcd; if (mod_data.is_super_speed) { ss_hcd = usb_create_shared_hcd(&dummy_hcd, &pdev->dev, dev_name(&pdev->dev), hs_hcd); if (!ss_hcd) { retval = -ENOMEM; goto dealloc_usb2_hcd; } retval = usb_add_hcd(ss_hcd, 0, 0); if (retval) goto put_usb3_hcd; } return 0; put_usb3_hcd: usb_put_hcd(ss_hcd); dealloc_usb2_hcd: usb_remove_hcd(hs_hcd); put_usb2_hcd: usb_put_hcd(hs_hcd); dum->hs_hcd = dum->ss_hcd = NULL; return retval; } static void dummy_hcd_remove(struct platform_device *pdev) { struct dummy *dum; dum = hcd_to_dummy_hcd(platform_get_drvdata(pdev))->dum; if (dum->ss_hcd) { usb_remove_hcd(dummy_hcd_to_hcd(dum->ss_hcd)); usb_put_hcd(dummy_hcd_to_hcd(dum->ss_hcd)); } usb_remove_hcd(dummy_hcd_to_hcd(dum->hs_hcd)); usb_put_hcd(dummy_hcd_to_hcd(dum->hs_hcd)); dum->hs_hcd = NULL; dum->ss_hcd = NULL; } static int dummy_hcd_suspend(struct platform_device *pdev, pm_message_t state) { struct usb_hcd *hcd; struct dummy_hcd *dum_hcd; int rc = 0; dev_dbg(&pdev->dev, "%s\n", __func__); hcd = platform_get_drvdata(pdev); dum_hcd = hcd_to_dummy_hcd(hcd); if (dum_hcd->rh_state == DUMMY_RH_RUNNING) { dev_warn(&pdev->dev, "Root hub isn't suspended!\n"); rc = -EBUSY; } else clear_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); return rc; } static int dummy_hcd_resume(struct platform_device *pdev) { struct usb_hcd *hcd; dev_dbg(&pdev->dev, "%s\n", __func__); hcd = platform_get_drvdata(pdev); set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); usb_hcd_poll_rh_status(hcd); return 0; } static struct platform_driver dummy_hcd_driver = { .probe = dummy_hcd_probe, .remove_new = dummy_hcd_remove, .suspend = dummy_hcd_suspend, .resume = dummy_hcd_resume, .driver = { .name = driver_name, }, }; /*-------------------------------------------------------------------------*/ #define MAX_NUM_UDC 32 static struct platform_device *the_udc_pdev[MAX_NUM_UDC]; static struct platform_device *the_hcd_pdev[MAX_NUM_UDC]; static int __init dummy_hcd_init(void) { int retval = -ENOMEM; int i; struct dummy *dum[MAX_NUM_UDC] = {}; if (usb_disabled()) return -ENODEV; if (!mod_data.is_high_speed && mod_data.is_super_speed) return -EINVAL; if (mod_data.num < 1 || mod_data.num > MAX_NUM_UDC) { pr_err("Number of emulated UDC must be in range of 1...%d\n", MAX_NUM_UDC); return -EINVAL; } for (i = 0; i < mod_data.num; i++) { the_hcd_pdev[i] = platform_device_alloc(driver_name, i); if (!the_hcd_pdev[i]) { i--; while (i >= 0) platform_device_put(the_hcd_pdev[i--]); return retval; } } for (i = 0; i < mod_data.num; i++) { the_udc_pdev[i] = platform_device_alloc(gadget_name, i); if (!the_udc_pdev[i]) { i--; while (i >= 0) platform_device_put(the_udc_pdev[i--]); goto err_alloc_udc; } } for (i = 0; i < mod_data.num; i++) { dum[i] = kzalloc(sizeof(struct dummy), GFP_KERNEL); if (!dum[i]) { retval = -ENOMEM; goto err_add_pdata; } retval = platform_device_add_data(the_hcd_pdev[i], &dum[i], sizeof(void *)); if (retval) goto err_add_pdata; retval = platform_device_add_data(the_udc_pdev[i], &dum[i], sizeof(void *)); if (retval) goto err_add_pdata; } retval = platform_driver_register(&dummy_hcd_driver); if (retval < 0) goto err_add_pdata; retval = platform_driver_register(&dummy_udc_driver); if (retval < 0) goto err_register_udc_driver; for (i = 0; i < mod_data.num; i++) { retval = platform_device_add(the_hcd_pdev[i]); if (retval < 0) { i--; while (i >= 0) platform_device_del(the_hcd_pdev[i--]); goto err_add_hcd; } } for (i = 0; i < mod_data.num; i++) { if (!dum[i]->hs_hcd || (!dum[i]->ss_hcd && mod_data.is_super_speed)) { /* * The hcd was added successfully but its probe * function failed for some reason. */ retval = -EINVAL; goto err_add_udc; } } for (i = 0; i < mod_data.num; i++) { retval = platform_device_add(the_udc_pdev[i]); if (retval < 0) { i--; while (i >= 0) platform_device_del(the_udc_pdev[i--]); goto err_add_udc; } } for (i = 0; i < mod_data.num; i++) { if (!platform_get_drvdata(the_udc_pdev[i])) { /* * The udc was added successfully but its probe * function failed for some reason. */ retval = -EINVAL; goto err_probe_udc; } } return retval; err_probe_udc: for (i = 0; i < mod_data.num; i++) platform_device_del(the_udc_pdev[i]); err_add_udc: for (i = 0; i < mod_data.num; i++) platform_device_del(the_hcd_pdev[i]); err_add_hcd: platform_driver_unregister(&dummy_udc_driver); err_register_udc_driver: platform_driver_unregister(&dummy_hcd_driver); err_add_pdata: for (i = 0; i < mod_data.num; i++) kfree(dum[i]); for (i = 0; i < mod_data.num; i++) platform_device_put(the_udc_pdev[i]); err_alloc_udc: for (i = 0; i < mod_data.num; i++) platform_device_put(the_hcd_pdev[i]); return retval; } module_init(dummy_hcd_init); static void __exit dummy_hcd_cleanup(void) { int i; for (i = 0; i < mod_data.num; i++) { struct dummy *dum; dum = *((void **)dev_get_platdata(&the_udc_pdev[i]->dev)); platform_device_unregister(the_udc_pdev[i]); platform_device_unregister(the_hcd_pdev[i]); kfree(dum); } platform_driver_unregister(&dummy_udc_driver); platform_driver_unregister(&dummy_hcd_driver); } module_exit(dummy_hcd_cleanup);
1 1 2 2 1 1 3 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 // SPDX-License-Identifier: GPL-2.0-or-later /* * Roccat Kone[+] driver for Linux * * Copyright (c) 2010 Stefan Achatz <erazor_de@users.sourceforge.net> */ /* */ /* * Roccat Kone[+] is an updated/improved version of the Kone with more memory * and functionality and without the non-standard behaviours the Kone had. * KoneXTD has same capabilities but updated sensor. */ #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/hid-roccat.h> #include "hid-ids.h" #include "hid-roccat-common.h" #include "hid-roccat-koneplus.h" static uint profile_numbers[5] = {0, 1, 2, 3, 4}; static void koneplus_profile_activated(struct koneplus_device *koneplus, uint new_profile) { koneplus->actual_profile = new_profile; } static int koneplus_send_control(struct usb_device *usb_dev, uint value, enum koneplus_control_requests request) { struct roccat_common2_control control; if ((request == KONEPLUS_CONTROL_REQUEST_PROFILE_SETTINGS || request == KONEPLUS_CONTROL_REQUEST_PROFILE_BUTTONS) && value > 4) return -EINVAL; control.command = ROCCAT_COMMON_COMMAND_CONTROL; control.value = value; control.request = request; return roccat_common2_send_with_status(usb_dev, ROCCAT_COMMON_COMMAND_CONTROL, &control, sizeof(struct roccat_common2_control)); } /* retval is 0-4 on success, < 0 on error */ static int koneplus_get_actual_profile(struct usb_device *usb_dev) { struct koneplus_actual_profile buf; int retval; retval = roccat_common2_receive(usb_dev, KONEPLUS_COMMAND_ACTUAL_PROFILE, &buf, KONEPLUS_SIZE_ACTUAL_PROFILE); return retval ? retval : buf.actual_profile; } static int koneplus_set_actual_profile(struct usb_device *usb_dev, int new_profile) { struct koneplus_actual_profile buf; buf.command = KONEPLUS_COMMAND_ACTUAL_PROFILE; buf.size = KONEPLUS_SIZE_ACTUAL_PROFILE; buf.actual_profile = new_profile; return roccat_common2_send_with_status(usb_dev, KONEPLUS_COMMAND_ACTUAL_PROFILE, &buf, KONEPLUS_SIZE_ACTUAL_PROFILE); } static ssize_t koneplus_sysfs_read(struct file *fp, struct kobject *kobj, char *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct koneplus_device *koneplus = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off >= real_size) return 0; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&koneplus->koneplus_lock); retval = roccat_common2_receive(usb_dev, command, buf, real_size); mutex_unlock(&koneplus->koneplus_lock); if (retval) return retval; return real_size; } static ssize_t koneplus_sysfs_write(struct file *fp, struct kobject *kobj, void const *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct koneplus_device *koneplus = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&koneplus->koneplus_lock); retval = roccat_common2_send_with_status(usb_dev, command, buf, real_size); mutex_unlock(&koneplus->koneplus_lock); if (retval) return retval; return real_size; } #define KONEPLUS_SYSFS_W(thingy, THINGY) \ static ssize_t koneplus_sysfs_write_ ## thingy(struct file *fp, \ struct kobject *kobj, struct bin_attribute *attr, char *buf, \ loff_t off, size_t count) \ { \ return koneplus_sysfs_write(fp, kobj, buf, off, count, \ KONEPLUS_SIZE_ ## THINGY, KONEPLUS_COMMAND_ ## THINGY); \ } #define KONEPLUS_SYSFS_R(thingy, THINGY) \ static ssize_t koneplus_sysfs_read_ ## thingy(struct file *fp, \ struct kobject *kobj, struct bin_attribute *attr, char *buf, \ loff_t off, size_t count) \ { \ return koneplus_sysfs_read(fp, kobj, buf, off, count, \ KONEPLUS_SIZE_ ## THINGY, KONEPLUS_COMMAND_ ## THINGY); \ } #define KONEPLUS_SYSFS_RW(thingy, THINGY) \ KONEPLUS_SYSFS_W(thingy, THINGY) \ KONEPLUS_SYSFS_R(thingy, THINGY) #define KONEPLUS_BIN_ATTRIBUTE_RW(thingy, THINGY) \ KONEPLUS_SYSFS_RW(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0660 }, \ .size = KONEPLUS_SIZE_ ## THINGY, \ .read = koneplus_sysfs_read_ ## thingy, \ .write = koneplus_sysfs_write_ ## thingy \ } #define KONEPLUS_BIN_ATTRIBUTE_R(thingy, THINGY) \ KONEPLUS_SYSFS_R(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0440 }, \ .size = KONEPLUS_SIZE_ ## THINGY, \ .read = koneplus_sysfs_read_ ## thingy, \ } #define KONEPLUS_BIN_ATTRIBUTE_W(thingy, THINGY) \ KONEPLUS_SYSFS_W(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0220 }, \ .size = KONEPLUS_SIZE_ ## THINGY, \ .write = koneplus_sysfs_write_ ## thingy \ } KONEPLUS_BIN_ATTRIBUTE_W(control, CONTROL); KONEPLUS_BIN_ATTRIBUTE_W(talk, TALK); KONEPLUS_BIN_ATTRIBUTE_W(macro, MACRO); KONEPLUS_BIN_ATTRIBUTE_R(tcu_image, TCU_IMAGE); KONEPLUS_BIN_ATTRIBUTE_RW(info, INFO); KONEPLUS_BIN_ATTRIBUTE_RW(sensor, SENSOR); KONEPLUS_BIN_ATTRIBUTE_RW(tcu, TCU); KONEPLUS_BIN_ATTRIBUTE_RW(profile_settings, PROFILE_SETTINGS); KONEPLUS_BIN_ATTRIBUTE_RW(profile_buttons, PROFILE_BUTTONS); static ssize_t koneplus_sysfs_read_profilex_settings(struct file *fp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); ssize_t retval; retval = koneplus_send_control(usb_dev, *(uint *)(attr->private), KONEPLUS_CONTROL_REQUEST_PROFILE_SETTINGS); if (retval) return retval; return koneplus_sysfs_read(fp, kobj, buf, off, count, KONEPLUS_SIZE_PROFILE_SETTINGS, KONEPLUS_COMMAND_PROFILE_SETTINGS); } static ssize_t koneplus_sysfs_read_profilex_buttons(struct file *fp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); ssize_t retval; retval = koneplus_send_control(usb_dev, *(uint *)(attr->private), KONEPLUS_CONTROL_REQUEST_PROFILE_BUTTONS); if (retval) return retval; return koneplus_sysfs_read(fp, kobj, buf, off, count, KONEPLUS_SIZE_PROFILE_BUTTONS, KONEPLUS_COMMAND_PROFILE_BUTTONS); } #define PROFILE_ATTR(number) \ static struct bin_attribute bin_attr_profile##number##_settings = { \ .attr = { .name = "profile" #number "_settings", .mode = 0440 }, \ .size = KONEPLUS_SIZE_PROFILE_SETTINGS, \ .read = koneplus_sysfs_read_profilex_settings, \ .private = &profile_numbers[number-1], \ }; \ static struct bin_attribute bin_attr_profile##number##_buttons = { \ .attr = { .name = "profile" #number "_buttons", .mode = 0440 }, \ .size = KONEPLUS_SIZE_PROFILE_BUTTONS, \ .read = koneplus_sysfs_read_profilex_buttons, \ .private = &profile_numbers[number-1], \ }; PROFILE_ATTR(1); PROFILE_ATTR(2); PROFILE_ATTR(3); PROFILE_ATTR(4); PROFILE_ATTR(5); static ssize_t koneplus_sysfs_show_actual_profile(struct device *dev, struct device_attribute *attr, char *buf) { struct koneplus_device *koneplus = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return snprintf(buf, PAGE_SIZE, "%d\n", koneplus->actual_profile); } static ssize_t koneplus_sysfs_set_actual_profile(struct device *dev, struct device_attribute *attr, char const *buf, size_t size) { struct koneplus_device *koneplus; struct usb_device *usb_dev; unsigned long profile; int retval; struct koneplus_roccat_report roccat_report; dev = dev->parent->parent; koneplus = hid_get_drvdata(dev_get_drvdata(dev)); usb_dev = interface_to_usbdev(to_usb_interface(dev)); retval = kstrtoul(buf, 10, &profile); if (retval) return retval; if (profile > 4) return -EINVAL; mutex_lock(&koneplus->koneplus_lock); retval = koneplus_set_actual_profile(usb_dev, profile); if (retval) { mutex_unlock(&koneplus->koneplus_lock); return retval; } koneplus_profile_activated(koneplus, profile); roccat_report.type = KONEPLUS_MOUSE_REPORT_BUTTON_TYPE_PROFILE; roccat_report.data1 = profile + 1; roccat_report.data2 = 0; roccat_report.profile = profile + 1; roccat_report_event(koneplus->chrdev_minor, (uint8_t const *)&roccat_report); mutex_unlock(&koneplus->koneplus_lock); return size; } static DEVICE_ATTR(actual_profile, 0660, koneplus_sysfs_show_actual_profile, koneplus_sysfs_set_actual_profile); static DEVICE_ATTR(startup_profile, 0660, koneplus_sysfs_show_actual_profile, koneplus_sysfs_set_actual_profile); static ssize_t koneplus_sysfs_show_firmware_version(struct device *dev, struct device_attribute *attr, char *buf) { struct koneplus_device *koneplus; struct usb_device *usb_dev; struct koneplus_info info; dev = dev->parent->parent; koneplus = hid_get_drvdata(dev_get_drvdata(dev)); usb_dev = interface_to_usbdev(to_usb_interface(dev)); mutex_lock(&koneplus->koneplus_lock); roccat_common2_receive(usb_dev, KONEPLUS_COMMAND_INFO, &info, KONEPLUS_SIZE_INFO); mutex_unlock(&koneplus->koneplus_lock); return snprintf(buf, PAGE_SIZE, "%d\n", info.firmware_version); } static DEVICE_ATTR(firmware_version, 0440, koneplus_sysfs_show_firmware_version, NULL); static struct attribute *koneplus_attrs[] = { &dev_attr_actual_profile.attr, &dev_attr_startup_profile.attr, &dev_attr_firmware_version.attr, NULL, }; static struct bin_attribute *koneplus_bin_attributes[] = { &bin_attr_control, &bin_attr_talk, &bin_attr_macro, &bin_attr_tcu_image, &bin_attr_info, &bin_attr_sensor, &bin_attr_tcu, &bin_attr_profile_settings, &bin_attr_profile_buttons, &bin_attr_profile1_settings, &bin_attr_profile2_settings, &bin_attr_profile3_settings, &bin_attr_profile4_settings, &bin_attr_profile5_settings, &bin_attr_profile1_buttons, &bin_attr_profile2_buttons, &bin_attr_profile3_buttons, &bin_attr_profile4_buttons, &bin_attr_profile5_buttons, NULL, }; static const struct attribute_group koneplus_group = { .attrs = koneplus_attrs, .bin_attrs = koneplus_bin_attributes, }; static const struct attribute_group *koneplus_groups[] = { &koneplus_group, NULL, }; static const struct class koneplus_class = { .name = "koneplus", .dev_groups = koneplus_groups, }; static int koneplus_init_koneplus_device_struct(struct usb_device *usb_dev, struct koneplus_device *koneplus) { int retval; mutex_init(&koneplus->koneplus_lock); retval = koneplus_get_actual_profile(usb_dev); if (retval < 0) return retval; koneplus_profile_activated(koneplus, retval); return 0; } static int koneplus_init_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct usb_device *usb_dev = interface_to_usbdev(intf); struct koneplus_device *koneplus; int retval; if (intf->cur_altsetting->desc.bInterfaceProtocol == USB_INTERFACE_PROTOCOL_MOUSE) { koneplus = kzalloc(sizeof(*koneplus), GFP_KERNEL); if (!koneplus) { hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } hid_set_drvdata(hdev, koneplus); retval = koneplus_init_koneplus_device_struct(usb_dev, koneplus); if (retval) { hid_err(hdev, "couldn't init struct koneplus_device\n"); goto exit_free; } retval = roccat_connect(&koneplus_class, hdev, sizeof(struct koneplus_roccat_report)); if (retval < 0) { hid_err(hdev, "couldn't init char dev\n"); } else { koneplus->chrdev_minor = retval; koneplus->roccat_claimed = 1; } } else { hid_set_drvdata(hdev, NULL); } return 0; exit_free: kfree(koneplus); return retval; } static void koneplus_remove_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct koneplus_device *koneplus; if (intf->cur_altsetting->desc.bInterfaceProtocol == USB_INTERFACE_PROTOCOL_MOUSE) { koneplus = hid_get_drvdata(hdev); if (koneplus->roccat_claimed) roccat_disconnect(koneplus->chrdev_minor); kfree(koneplus); } } static int koneplus_probe(struct hid_device *hdev, const struct hid_device_id *id) { int retval; if (!hid_is_usb(hdev)) return -EINVAL; retval = hid_parse(hdev); if (retval) { hid_err(hdev, "parse failed\n"); goto exit; } retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (retval) { hid_err(hdev, "hw start failed\n"); goto exit; } retval = koneplus_init_specials(hdev); if (retval) { hid_err(hdev, "couldn't install mouse\n"); goto exit_stop; } return 0; exit_stop: hid_hw_stop(hdev); exit: return retval; } static void koneplus_remove(struct hid_device *hdev) { koneplus_remove_specials(hdev); hid_hw_stop(hdev); } static void koneplus_keep_values_up_to_date(struct koneplus_device *koneplus, u8 const *data) { struct koneplus_mouse_report_button const *button_report; switch (data[0]) { case KONEPLUS_MOUSE_REPORT_NUMBER_BUTTON: button_report = (struct koneplus_mouse_report_button const *)data; switch (button_report->type) { case KONEPLUS_MOUSE_REPORT_BUTTON_TYPE_PROFILE: koneplus_profile_activated(koneplus, button_report->data1 - 1); break; } break; } } static void koneplus_report_to_chrdev(struct koneplus_device const *koneplus, u8 const *data) { struct koneplus_roccat_report roccat_report; struct koneplus_mouse_report_button const *button_report; if (data[0] != KONEPLUS_MOUSE_REPORT_NUMBER_BUTTON) return; button_report = (struct koneplus_mouse_report_button const *)data; if ((button_report->type == KONEPLUS_MOUSE_REPORT_BUTTON_TYPE_QUICKLAUNCH || button_report->type == KONEPLUS_MOUSE_REPORT_BUTTON_TYPE_TIMER) && button_report->data2 != KONEPLUS_MOUSE_REPORT_BUTTON_ACTION_PRESS) return; roccat_report.type = button_report->type; roccat_report.data1 = button_report->data1; roccat_report.data2 = button_report->data2; roccat_report.profile = koneplus->actual_profile + 1; roccat_report_event(koneplus->chrdev_minor, (uint8_t const *)&roccat_report); } static int koneplus_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct koneplus_device *koneplus = hid_get_drvdata(hdev); if (intf->cur_altsetting->desc.bInterfaceProtocol != USB_INTERFACE_PROTOCOL_MOUSE) return 0; if (koneplus == NULL) return 0; koneplus_keep_values_up_to_date(koneplus, data); if (koneplus->roccat_claimed) koneplus_report_to_chrdev(koneplus, data); return 0; } static const struct hid_device_id koneplus_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_KONEPLUS) }, { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_KONEXTD) }, { } }; MODULE_DEVICE_TABLE(hid, koneplus_devices); static struct hid_driver koneplus_driver = { .name = "koneplus", .id_table = koneplus_devices, .probe = koneplus_probe, .remove = koneplus_remove, .raw_event = koneplus_raw_event }; static int __init koneplus_init(void) { int retval; /* class name has to be same as driver name */ retval = class_register(&koneplus_class); if (retval) return retval; retval = hid_register_driver(&koneplus_driver); if (retval) class_unregister(&koneplus_class); return retval; } static void __exit koneplus_exit(void) { hid_unregister_driver(&koneplus_driver); class_unregister(&koneplus_class); } module_init(koneplus_init); module_exit(koneplus_exit); MODULE_AUTHOR("Stefan Achatz"); MODULE_DESCRIPTION("USB Roccat Kone[+]/XTD driver"); MODULE_LICENSE("GPL v2");
1719 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FS_NOTIFY_FSNOTIFY_H_ #define __FS_NOTIFY_FSNOTIFY_H_ #include <linux/list.h> #include <linux/fsnotify.h> #include <linux/srcu.h> #include <linux/types.h> #include "../mount.h" static inline struct inode *fsnotify_conn_inode( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct inode, i_fsnotify_marks); } static inline struct mount *fsnotify_conn_mount( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct mount, mnt_fsnotify_marks); } static inline struct super_block *fsnotify_conn_sb( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct super_block, s_fsnotify_marks); } static inline struct super_block *fsnotify_connector_sb( struct fsnotify_mark_connector *conn) { switch (conn->type) { case FSNOTIFY_OBJ_TYPE_INODE: return fsnotify_conn_inode(conn)->i_sb; case FSNOTIFY_OBJ_TYPE_VFSMOUNT: return fsnotify_conn_mount(conn)->mnt.mnt_sb; case FSNOTIFY_OBJ_TYPE_SB: return fsnotify_conn_sb(conn); default: return NULL; } } /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); /* Destroy all marks attached to an object via connector */ extern void fsnotify_destroy_marks(fsnotify_connp_t *connp); /* run the list of all marks associated with inode and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { fsnotify_destroy_marks(&inode->i_fsnotify_marks); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } /* run the list of all marks associated with sb and destroy them */ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { fsnotify_destroy_marks(&sb->s_fsnotify_marks); } /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. */ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); extern struct kmem_cache *fsnotify_mark_connector_cachep; #endif /* __FS_NOTIFY_FSNOTIFY_H_ */
23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 // SPDX-License-Identifier: GPL-2.0 /* * consolemap.c * * Mapping from internal code (such as Latin-1 or Unicode or IBM PC code) * to font positions. * * aeb, 950210 * * Support for multiple unimaps by Jakub Jelinek <jj@ultra.linux.cz>, July 1998 * * Fix bug in inverse translation. Stanislav Voronyi <stas@cnti.uanet.kharkov.ua>, Dec 1998 * * In order to prevent the following circular lock dependency: * &mm->mmap_lock --> cpu_hotplug.lock --> console_lock --> &mm->mmap_lock * * We cannot allow page fault to happen while holding the console_lock. * Therefore, all the userspace copy operations have to be done outside * the console_lock critical sections. * * As all the affected functions are all called directly from vt_ioctl(), we * can allocate some small buffers directly on stack without worrying about * stack overflow. */ #include <linux/bitfield.h> #include <linux/bits.h> #include <linux/module.h> #include <linux/kd.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/tty.h> #include <linux/uaccess.h> #include <linux/console.h> #include <linux/consolemap.h> #include <linux/vt_kern.h> #include <linux/string.h> static unsigned short translations[][E_TABSZ] = { /* 8-bit Latin-1 mapped to Unicode -- trivial mapping */ [LAT1_MAP] = { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff }, /* VT100 graphics mapped to Unicode */ [GRAF_MAP] = { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x2192, 0x2190, 0x2191, 0x2193, 0x002f, 0x2588, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x00a0, 0x25c6, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1, 0x2591, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba, 0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c, 0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff }, /* IBM Codepage 437 mapped to Unicode */ [IBMPC_MAP] = { 0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, 0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, 0x263c, 0x25b6, 0x25c0, 0x2195, 0x203c, 0x00b6, 0x00a7, 0x25ac, 0x21a8, 0x2191, 0x2193, 0x2192, 0x2190, 0x221f, 0x2194, 0x25b2, 0x25bc, 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x2302, 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229, 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0 }, /* User mapping -- default to codes for direct font mapping */ [USER_MAP] = { 0xf000, 0xf001, 0xf002, 0xf003, 0xf004, 0xf005, 0xf006, 0xf007, 0xf008, 0xf009, 0xf00a, 0xf00b, 0xf00c, 0xf00d, 0xf00e, 0xf00f, 0xf010, 0xf011, 0xf012, 0xf013, 0xf014, 0xf015, 0xf016, 0xf017, 0xf018, 0xf019, 0xf01a, 0xf01b, 0xf01c, 0xf01d, 0xf01e, 0xf01f, 0xf020, 0xf021, 0xf022, 0xf023, 0xf024, 0xf025, 0xf026, 0xf027, 0xf028, 0xf029, 0xf02a, 0xf02b, 0xf02c, 0xf02d, 0xf02e, 0xf02f, 0xf030, 0xf031, 0xf032, 0xf033, 0xf034, 0xf035, 0xf036, 0xf037, 0xf038, 0xf039, 0xf03a, 0xf03b, 0xf03c, 0xf03d, 0xf03e, 0xf03f, 0xf040, 0xf041, 0xf042, 0xf043, 0xf044, 0xf045, 0xf046, 0xf047, 0xf048, 0xf049, 0xf04a, 0xf04b, 0xf04c, 0xf04d, 0xf04e, 0xf04f, 0xf050, 0xf051, 0xf052, 0xf053, 0xf054, 0xf055, 0xf056, 0xf057, 0xf058, 0xf059, 0xf05a, 0xf05b, 0xf05c, 0xf05d, 0xf05e, 0xf05f, 0xf060, 0xf061, 0xf062, 0xf063, 0xf064, 0xf065, 0xf066, 0xf067, 0xf068, 0xf069, 0xf06a, 0xf06b, 0xf06c, 0xf06d, 0xf06e, 0xf06f, 0xf070, 0xf071, 0xf072, 0xf073, 0xf074, 0xf075, 0xf076, 0xf077, 0xf078, 0xf079, 0xf07a, 0xf07b, 0xf07c, 0xf07d, 0xf07e, 0xf07f, 0xf080, 0xf081, 0xf082, 0xf083, 0xf084, 0xf085, 0xf086, 0xf087, 0xf088, 0xf089, 0xf08a, 0xf08b, 0xf08c, 0xf08d, 0xf08e, 0xf08f, 0xf090, 0xf091, 0xf092, 0xf093, 0xf094, 0xf095, 0xf096, 0xf097, 0xf098, 0xf099, 0xf09a, 0xf09b, 0xf09c, 0xf09d, 0xf09e, 0xf09f, 0xf0a0, 0xf0a1, 0xf0a2, 0xf0a3, 0xf0a4, 0xf0a5, 0xf0a6, 0xf0a7, 0xf0a8, 0xf0a9, 0xf0aa, 0xf0ab, 0xf0ac, 0xf0ad, 0xf0ae, 0xf0af, 0xf0b0, 0xf0b1, 0xf0b2, 0xf0b3, 0xf0b4, 0xf0b5, 0xf0b6, 0xf0b7, 0xf0b8, 0xf0b9, 0xf0ba, 0xf0bb, 0xf0bc, 0xf0bd, 0xf0be, 0xf0bf, 0xf0c0, 0xf0c1, 0xf0c2, 0xf0c3, 0xf0c4, 0xf0c5, 0xf0c6, 0xf0c7, 0xf0c8, 0xf0c9, 0xf0ca, 0xf0cb, 0xf0cc, 0xf0cd, 0xf0ce, 0xf0cf, 0xf0d0, 0xf0d1, 0xf0d2, 0xf0d3, 0xf0d4, 0xf0d5, 0xf0d6, 0xf0d7, 0xf0d8, 0xf0d9, 0xf0da, 0xf0db, 0xf0dc, 0xf0dd, 0xf0de, 0xf0df, 0xf0e0, 0xf0e1, 0xf0e2, 0xf0e3, 0xf0e4, 0xf0e5, 0xf0e6, 0xf0e7, 0xf0e8, 0xf0e9, 0xf0ea, 0xf0eb, 0xf0ec, 0xf0ed, 0xf0ee, 0xf0ef, 0xf0f0, 0xf0f1, 0xf0f2, 0xf0f3, 0xf0f4, 0xf0f5, 0xf0f6, 0xf0f7, 0xf0f8, 0xf0f9, 0xf0fa, 0xf0fb, 0xf0fc, 0xf0fd, 0xf0fe, 0xf0ff } }; /* The standard kernel character-to-font mappings are not invertible -- this is just a best effort. */ #define MAX_GLYPH 512 /* Max possible glyph value */ static enum translation_map inv_translate[MAX_NR_CONSOLES]; #define UNI_DIRS 32U #define UNI_DIR_ROWS 32U #define UNI_ROW_GLYPHS 64U #define UNI_DIR_BITS GENMASK(15, 11) #define UNI_ROW_BITS GENMASK(10, 6) #define UNI_GLYPH_BITS GENMASK( 5, 0) #define UNI_DIR(uni) FIELD_GET(UNI_DIR_BITS, (uni)) #define UNI_ROW(uni) FIELD_GET(UNI_ROW_BITS, (uni)) #define UNI_GLYPH(uni) FIELD_GET(UNI_GLYPH_BITS, (uni)) #define UNI(dir, row, glyph) (FIELD_PREP(UNI_DIR_BITS, (dir)) | \ FIELD_PREP(UNI_ROW_BITS, (row)) | \ FIELD_PREP(UNI_GLYPH_BITS, (glyph))) /** * struct uni_pagedict - unicode directory * * @uni_pgdir: 32*32*64 table with glyphs * @refcount: reference count of this structure * @sum: checksum * @inverse_translations: best-effort inverse mapping * @inverse_trans_unicode: best-effort inverse mapping to unicode */ struct uni_pagedict { u16 **uni_pgdir[UNI_DIRS]; unsigned long refcount; unsigned long sum; unsigned char *inverse_translations[LAST_MAP + 1]; u16 *inverse_trans_unicode; }; static struct uni_pagedict *dflt; static void set_inverse_transl(struct vc_data *conp, struct uni_pagedict *dict, enum translation_map m) { unsigned short *t = translations[m]; unsigned char *inv; if (!dict) return; inv = dict->inverse_translations[m]; if (!inv) { inv = dict->inverse_translations[m] = kmalloc(MAX_GLYPH, GFP_KERNEL); if (!inv) return; } memset(inv, 0, MAX_GLYPH); for (unsigned int ch = 0; ch < ARRAY_SIZE(translations[m]); ch++) { int glyph = conv_uni_to_pc(conp, t[ch]); if (glyph >= 0 && glyph < MAX_GLYPH && inv[glyph] < 32) { /* prefer '-' above SHY etc. */ inv[glyph] = ch; } } } static void set_inverse_trans_unicode(struct uni_pagedict *dict) { unsigned int d, r, g; u16 *inv; if (!dict) return; inv = dict->inverse_trans_unicode; if (!inv) { inv = dict->inverse_trans_unicode = kmalloc_array(MAX_GLYPH, sizeof(*inv), GFP_KERNEL); if (!inv) return; } memset(inv, 0, MAX_GLYPH * sizeof(*inv)); for (d = 0; d < UNI_DIRS; d++) { u16 **dir = dict->uni_pgdir[d]; if (!dir) continue; for (r = 0; r < UNI_DIR_ROWS; r++) { u16 *row = dir[r]; if (!row) continue; for (g = 0; g < UNI_ROW_GLYPHS; g++) { u16 glyph = row[g]; if (glyph < MAX_GLYPH && inv[glyph] < 32) inv[glyph] = UNI(d, r, g); } } } } unsigned short *set_translate(enum translation_map m, struct vc_data *vc) { inv_translate[vc->vc_num] = m; return translations[m]; } /* * Inverse translation is impossible for several reasons: * 1. The font<->character maps are not 1-1. * 2. The text may have been written while a different translation map * was active. * Still, it is now possible to a certain extent to cut and paste non-ASCII. */ u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) { struct uni_pagedict *p; enum translation_map m; if (glyph >= MAX_GLYPH) return 0; p = *conp->uni_pagedict_loc; if (!p) return glyph; if (use_unicode) { if (!p->inverse_trans_unicode) return glyph; return p->inverse_trans_unicode[glyph]; } m = inv_translate[conp->vc_num]; if (!p->inverse_translations[m]) return glyph; return p->inverse_translations[m][glyph]; } EXPORT_SYMBOL_GPL(inverse_translate); static void update_user_maps(void) { int i; struct uni_pagedict *p, *q = NULL; for (i = 0; i < MAX_NR_CONSOLES; i++) { if (!vc_cons_allocated(i)) continue; p = *vc_cons[i].d->uni_pagedict_loc; if (p && p != q) { set_inverse_transl(vc_cons[i].d, p, USER_MAP); set_inverse_trans_unicode(p); q = p; } } } /* * Load customizable translation table * arg points to a 256 byte translation table. * * The "old" variants are for translation directly to font (using the * 0xf000-0xf0ff "transparent" Unicodes) whereas the "new" variants set * Unicodes explicitly. */ int con_set_trans_old(unsigned char __user * arg) { unsigned short inbuf[E_TABSZ]; unsigned int i; unsigned char ch; for (i = 0; i < ARRAY_SIZE(inbuf); i++) { if (get_user(ch, &arg[i])) return -EFAULT; inbuf[i] = UNI_DIRECT_BASE | ch; } console_lock(); memcpy(translations[USER_MAP], inbuf, sizeof(inbuf)); update_user_maps(); console_unlock(); return 0; } int con_get_trans_old(unsigned char __user * arg) { int i, ch; unsigned short *p = translations[USER_MAP]; unsigned char outbuf[E_TABSZ]; console_lock(); for (i = 0; i < ARRAY_SIZE(outbuf); i++) { ch = conv_uni_to_pc(vc_cons[fg_console].d, p[i]); outbuf[i] = (ch & ~0xff) ? 0 : ch; } console_unlock(); return copy_to_user(arg, outbuf, sizeof(outbuf)) ? -EFAULT : 0; } int con_set_trans_new(ushort __user * arg) { unsigned short inbuf[E_TABSZ]; if (copy_from_user(inbuf, arg, sizeof(inbuf))) return -EFAULT; console_lock(); memcpy(translations[USER_MAP], inbuf, sizeof(inbuf)); update_user_maps(); console_unlock(); return 0; } int con_get_trans_new(ushort __user * arg) { unsigned short outbuf[E_TABSZ]; console_lock(); memcpy(outbuf, translations[USER_MAP], sizeof(outbuf)); console_unlock(); return copy_to_user(arg, outbuf, sizeof(outbuf)) ? -EFAULT : 0; } /* * Unicode -> current font conversion * * A font has at most 512 chars, usually 256. * But one font position may represent several Unicode chars. * A hashtable is somewhat of a pain to deal with, so use a * "paged table" instead. Simulation has shown the memory cost of * this 3-level paged table scheme to be comparable to a hash table. */ extern u8 dfont_unicount[]; /* Defined in console_defmap.c */ extern u16 dfont_unitable[]; static void con_release_unimap(struct uni_pagedict *dict) { unsigned int d, r; if (dict == dflt) dflt = NULL; for (d = 0; d < UNI_DIRS; d++) { u16 **dir = dict->uni_pgdir[d]; if (dir != NULL) { for (r = 0; r < UNI_DIR_ROWS; r++) kfree(dir[r]); kfree(dir); } dict->uni_pgdir[d] = NULL; } for (r = 0; r < ARRAY_SIZE(dict->inverse_translations); r++) { kfree(dict->inverse_translations[r]); dict->inverse_translations[r] = NULL; } kfree(dict->inverse_trans_unicode); dict->inverse_trans_unicode = NULL; } /* Caller must hold the console lock */ void con_free_unimap(struct vc_data *vc) { struct uni_pagedict *p; p = *vc->uni_pagedict_loc; if (!p) return; *vc->uni_pagedict_loc = NULL; if (--p->refcount) return; con_release_unimap(p); kfree(p); } static int con_unify_unimap(struct vc_data *conp, struct uni_pagedict *dict1) { struct uni_pagedict *dict2; unsigned int cons, d, r; for (cons = 0; cons < MAX_NR_CONSOLES; cons++) { if (!vc_cons_allocated(cons)) continue; dict2 = *vc_cons[cons].d->uni_pagedict_loc; if (!dict2 || dict2 == dict1 || dict2->sum != dict1->sum) continue; for (d = 0; d < UNI_DIRS; d++) { u16 **dir1 = dict1->uni_pgdir[d]; u16 **dir2 = dict2->uni_pgdir[d]; if (!dir1 && !dir2) continue; if (!dir1 || !dir2) break; for (r = 0; r < UNI_DIR_ROWS; r++) { if (!dir1[r] && !dir2[r]) continue; if (!dir1[r] || !dir2[r]) break; if (memcmp(dir1[r], dir2[r], UNI_ROW_GLYPHS * sizeof(*dir1[r]))) break; } if (r < UNI_DIR_ROWS) break; } if (d == UNI_DIRS) { dict2->refcount++; *conp->uni_pagedict_loc = dict2; con_release_unimap(dict1); kfree(dict1); return 1; } } return 0; } static int con_insert_unipair(struct uni_pagedict *p, u_short unicode, u_short fontpos) { u16 **dir, *row; unsigned int n; n = UNI_DIR(unicode); dir = p->uni_pgdir[n]; if (!dir) { dir = p->uni_pgdir[n] = kcalloc(UNI_DIR_ROWS, sizeof(*dir), GFP_KERNEL); if (!dir) return -ENOMEM; } n = UNI_ROW(unicode); row = dir[n]; if (!row) { row = dir[n] = kmalloc_array(UNI_ROW_GLYPHS, sizeof(*row), GFP_KERNEL); if (!row) return -ENOMEM; /* No glyphs for the characters (yet) */ memset(row, 0xff, UNI_ROW_GLYPHS * sizeof(*row)); } row[UNI_GLYPH(unicode)] = fontpos; p->sum += (fontpos << 20U) + unicode; return 0; } static int con_allocate_new(struct vc_data *vc) { struct uni_pagedict *new, *old = *vc->uni_pagedict_loc; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; new->refcount = 1; *vc->uni_pagedict_loc = new; if (old) old->refcount--; return 0; } /* Caller must hold the lock */ static int con_do_clear_unimap(struct vc_data *vc) { struct uni_pagedict *old = *vc->uni_pagedict_loc; if (!old || old->refcount > 1) return con_allocate_new(vc); old->sum = 0; con_release_unimap(old); return 0; } int con_clear_unimap(struct vc_data *vc) { int ret; console_lock(); ret = con_do_clear_unimap(vc); console_unlock(); return ret; } static struct uni_pagedict *con_unshare_unimap(struct vc_data *vc, struct uni_pagedict *old) { struct uni_pagedict *new; unsigned int d, r, g; int ret; u16 uni = 0; ret = con_allocate_new(vc); if (ret) return ERR_PTR(ret); new = *vc->uni_pagedict_loc; /* * uni_pgdir is a 32*32*64 table with rows allocated when its first * entry is added. The unicode value must still be incremented for * empty rows. We are copying entries from "old" to "new". */ for (d = 0; d < UNI_DIRS; d++) { u16 **dir = old->uni_pgdir[d]; if (!dir) { /* Account for empty table */ uni += UNI_DIR_ROWS * UNI_ROW_GLYPHS; continue; } for (r = 0; r < UNI_DIR_ROWS; r++) { u16 *row = dir[r]; if (!row) { /* Account for row of 64 empty entries */ uni += UNI_ROW_GLYPHS; continue; } for (g = 0; g < UNI_ROW_GLYPHS; g++, uni++) { if (row[g] == 0xffff) continue; /* * Found one, copy entry for unicode uni with * fontpos value row[g]. */ ret = con_insert_unipair(new, uni, row[g]); if (ret) { old->refcount++; *vc->uni_pagedict_loc = old; con_release_unimap(new); kfree(new); return ERR_PTR(ret); } } } } return new; } int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list) { int err = 0, err1; struct uni_pagedict *dict; struct unipair *unilist, *plist; if (!ct) return 0; unilist = vmemdup_user(list, array_size(sizeof(*unilist), ct)); if (IS_ERR(unilist)) return PTR_ERR(unilist); console_lock(); /* Save original vc_unipagdir_loc in case we allocate a new one */ dict = *vc->uni_pagedict_loc; if (!dict) { err = -EINVAL; goto out_unlock; } if (dict->refcount > 1) { dict = con_unshare_unimap(vc, dict); if (IS_ERR(dict)) { err = PTR_ERR(dict); goto out_unlock; } } else if (dict == dflt) { dflt = NULL; } /* * Insert user specified unicode pairs into new table. */ for (plist = unilist; ct; ct--, plist++) { err1 = con_insert_unipair(dict, plist->unicode, plist->fontpos); if (err1) err = err1; } /* * Merge with fontmaps of any other virtual consoles. */ if (con_unify_unimap(vc, dict)) goto out_unlock; for (enum translation_map m = FIRST_MAP; m <= LAST_MAP; m++) set_inverse_transl(vc, dict, m); set_inverse_trans_unicode(dict); out_unlock: console_unlock(); kvfree(unilist); return err; } /** * con_set_default_unimap - set default unicode map * @vc: the console we are updating * * Loads the unimap for the hardware font, as defined in uni_hash.tbl. * The representation used was the most compact I could come up * with. This routine is executed at video setup, and when the * PIO_FONTRESET ioctl is called. * * The caller must hold the console lock */ int con_set_default_unimap(struct vc_data *vc) { struct uni_pagedict *dict; unsigned int fontpos, count; int err = 0, err1; u16 *dfont; if (dflt) { dict = *vc->uni_pagedict_loc; if (dict == dflt) return 0; dflt->refcount++; *vc->uni_pagedict_loc = dflt; if (dict && !--dict->refcount) { con_release_unimap(dict); kfree(dict); } return 0; } /* The default font is always 256 characters */ err = con_do_clear_unimap(vc); if (err) return err; dict = *vc->uni_pagedict_loc; dfont = dfont_unitable; for (fontpos = 0; fontpos < 256U; fontpos++) for (count = dfont_unicount[fontpos]; count; count--) { err1 = con_insert_unipair(dict, *(dfont++), fontpos); if (err1) err = err1; } if (con_unify_unimap(vc, dict)) { dflt = *vc->uni_pagedict_loc; return err; } for (enum translation_map m = FIRST_MAP; m <= LAST_MAP; m++) set_inverse_transl(vc, dict, m); set_inverse_trans_unicode(dict); dflt = dict; return err; } EXPORT_SYMBOL(con_set_default_unimap); /** * con_copy_unimap - copy unimap between two vts * @dst_vc: target * @src_vc: source * * The caller must hold the console lock when invoking this method */ int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc) { struct uni_pagedict *src; if (!*src_vc->uni_pagedict_loc) return -EINVAL; if (*dst_vc->uni_pagedict_loc == *src_vc->uni_pagedict_loc) return 0; con_free_unimap(dst_vc); src = *src_vc->uni_pagedict_loc; src->refcount++; *dst_vc->uni_pagedict_loc = src; return 0; } EXPORT_SYMBOL(con_copy_unimap); /* * con_get_unimap - get the unicode map * * Read the console unicode data for this console. Called from the ioctl * handlers. */ int con_get_unimap(struct vc_data *vc, ushort ct, ushort __user *uct, struct unipair __user *list) { ushort ect; struct uni_pagedict *dict; struct unipair *unilist; unsigned int d, r, g; int ret = 0; unilist = kvmalloc_array(ct, sizeof(*unilist), GFP_KERNEL); if (!unilist) return -ENOMEM; console_lock(); ect = 0; dict = *vc->uni_pagedict_loc; if (!dict) goto unlock; for (d = 0; d < UNI_DIRS; d++) { u16 **dir = dict->uni_pgdir[d]; if (!dir) continue; for (r = 0; r < UNI_DIR_ROWS; r++) { u16 *row = dir[r]; if (!row) continue; for (g = 0; g < UNI_ROW_GLYPHS; g++, row++) { if (*row >= MAX_GLYPH) continue; if (ect < ct) { unilist[ect].unicode = UNI(d, r, g); unilist[ect].fontpos = *row; } ect++; } } } unlock: console_unlock(); if (copy_to_user(list, unilist, min(ect, ct) * sizeof(*unilist))) ret = -EFAULT; if (put_user(ect, uct)) ret = -EFAULT; kvfree(unilist); return ret ? ret : (ect <= ct) ? 0 : -ENOMEM; } /* * Always use USER_MAP. These functions are used by the keyboard, * which shouldn't be affected by G0/G1 switching, etc. * If the user map still contains default values, i.e. the * direct-to-font mapping, then assume user is using Latin1. * * FIXME: at some point we need to decide if we want to lock the table * update element itself via the keyboard_event_lock for consistency with the * keyboard driver as well as the consoles */ /* may be called during an interrupt */ u32 conv_8bit_to_uni(unsigned char c) { unsigned short uni = translations[USER_MAP][c]; return uni == (0xf000 | c) ? c : uni; } int conv_uni_to_8bit(u32 uni) { int c; for (c = 0; c < ARRAY_SIZE(translations[USER_MAP]); c++) if (translations[USER_MAP][c] == uni || (translations[USER_MAP][c] == (c | 0xf000) && uni == c)) return c; return -1; } int conv_uni_to_pc(struct vc_data *conp, long ucs) { struct uni_pagedict *dict; u16 **dir, *row, glyph; /* Only 16-bit codes supported at this time */ if (ucs > 0xffff) return -4; /* Not found */ else if (ucs < 0x20) return -1; /* Not a printable character */ else if (ucs == 0xfeff || (ucs >= 0x200b && ucs <= 0x200f)) return -2; /* Zero-width space */ /* * UNI_DIRECT_BASE indicates the start of the region in the User Zone * which always has a 1:1 mapping to the currently loaded font. The * UNI_DIRECT_MASK indicates the bit span of the region. */ else if ((ucs & ~UNI_DIRECT_MASK) == UNI_DIRECT_BASE) return ucs & UNI_DIRECT_MASK; dict = *conp->uni_pagedict_loc; if (!dict) return -3; dir = dict->uni_pgdir[UNI_DIR(ucs)]; if (!dir) return -4; row = dir[UNI_ROW(ucs)]; if (!row) return -4; glyph = row[UNI_GLYPH(ucs)]; if (glyph >= MAX_GLYPH) return -4; return glyph; } /* * This is called at sys_setup time, after memory and the console are * initialized. It must be possible to call kmalloc(..., GFP_KERNEL) * from this function, hence the call from sys_setup. */ void __init console_map_init(void) { int i; for (i = 0; i < MAX_NR_CONSOLES; i++) if (vc_cons_allocated(i) && !*vc_cons[i].d->uni_pagedict_loc) con_set_default_unimap(vc_cons[i].d); }
2 4 4 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 // SPDX-License-Identifier: GPL-2.0-only /* L2TP core. * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd * * This file contains some code of the original L2TPv2 pppol2tp * driver, which has the following copyright: * * Authors: Martijn van Oosterhout <kleptog@svana.org> * James Chapman (jchapman@katalix.com) * Contributors: * Michal Ostrowski <mostrows@speakeasy.net> * Arnaldo Carvalho de Melo <acme@xconectiva.com.br> * David S. Miller (davem@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/string.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/l2tp.h> #include <linux/hash.h> #include <linux/sort.h> #include <linux/file.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/dst.h> #include <net/ip.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/protocol.h> #include <net/inet6_connection_sock.h> #include <net/inet_ecn.h> #include <net/ip6_route.h> #include <net/ip6_checksum.h> #include <asm/byteorder.h> #include <linux/atomic.h> #include "l2tp_core.h" #include "trace.h" #define CREATE_TRACE_POINTS #include "trace.h" #define L2TP_DRV_VERSION "V2.0" /* L2TP header constants */ #define L2TP_HDRFLAG_T 0x8000 #define L2TP_HDRFLAG_L 0x4000 #define L2TP_HDRFLAG_S 0x0800 #define L2TP_HDRFLAG_O 0x0200 #define L2TP_HDRFLAG_P 0x0100 #define L2TP_HDR_VER_MASK 0x000F #define L2TP_HDR_VER_2 0x0002 #define L2TP_HDR_VER_3 0x0003 /* L2TPv3 default L2-specific sublayer */ #define L2TP_SLFLAG_S 0x40000000 #define L2TP_SL_SEQ_MASK 0x00ffffff #define L2TP_HDR_SIZE_MAX 14 /* Default trace flags */ #define L2TP_DEFAULT_DEBUG_FLAGS 0 /* Private data stored for received packets in the skb. */ struct l2tp_skb_cb { u32 ns; u16 has_seq; u16 length; unsigned long expires; }; #define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *)&(skb)->cb[sizeof(struct inet_skb_parm)]) static struct workqueue_struct *l2tp_wq; /* per-net private data for this module */ static unsigned int l2tp_net_id; struct l2tp_net { /* Lock for write access to l2tp_tunnel_idr */ spinlock_t l2tp_tunnel_idr_lock; struct idr l2tp_tunnel_idr; struct hlist_head l2tp_session_hlist[L2TP_HASH_SIZE_2]; /* Lock for write access to l2tp_session_hlist */ spinlock_t l2tp_session_hlist_lock; }; #if IS_ENABLED(CONFIG_IPV6) static bool l2tp_sk_is_v6(struct sock *sk) { return sk->sk_family == PF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_daddr); } #endif static inline struct l2tp_net *l2tp_pernet(const struct net *net) { return net_generic(net, l2tp_net_id); } /* Session hash global list for L2TPv3. * The session_id SHOULD be random according to RFC3931, but several * L2TP implementations use incrementing session_ids. So we do a real * hash on the session_id, rather than a simple bitmask. */ static inline struct hlist_head * l2tp_session_id_hash_2(struct l2tp_net *pn, u32 session_id) { return &pn->l2tp_session_hlist[hash_32(session_id, L2TP_HASH_BITS_2)]; } /* Session hash list. * The session_id SHOULD be random according to RFC2661, but several * L2TP implementations (Cisco and Microsoft) use incrementing * session_ids. So we do a real hash on the session_id, rather than a * simple bitmask. */ static inline struct hlist_head * l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) { return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; } static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) { trace_free_tunnel(tunnel); sock_put(tunnel->sock); /* the tunnel is freed in the socket destructor */ } static void l2tp_session_free(struct l2tp_session *session) { trace_free_session(session); if (session->tunnel) l2tp_tunnel_dec_refcount(session->tunnel); kfree(session); } struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk) { struct l2tp_tunnel *tunnel = sk->sk_user_data; if (tunnel) if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC)) return NULL; return tunnel; } EXPORT_SYMBOL_GPL(l2tp_sk_to_tunnel); void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) { refcount_inc(&tunnel->ref_count); } EXPORT_SYMBOL_GPL(l2tp_tunnel_inc_refcount); void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) { if (refcount_dec_and_test(&tunnel->ref_count)) l2tp_tunnel_free(tunnel); } EXPORT_SYMBOL_GPL(l2tp_tunnel_dec_refcount); void l2tp_session_inc_refcount(struct l2tp_session *session) { refcount_inc(&session->ref_count); } EXPORT_SYMBOL_GPL(l2tp_session_inc_refcount); void l2tp_session_dec_refcount(struct l2tp_session *session) { if (refcount_dec_and_test(&session->ref_count)) l2tp_session_free(session); } EXPORT_SYMBOL_GPL(l2tp_session_dec_refcount); /* Lookup a tunnel. A new reference is held on the returned tunnel. */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) { const struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel; rcu_read_lock_bh(); tunnel = idr_find(&pn->l2tp_tunnel_idr, tunnel_id); if (tunnel && refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_tunnel_get); struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth) { struct l2tp_net *pn = l2tp_pernet(net); unsigned long tunnel_id, tmp; struct l2tp_tunnel *tunnel; int count = 0; rcu_read_lock_bh(); idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { if (tunnel && ++count > nth && refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; } } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth); struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel, u32 session_id) { struct hlist_head *session_list; struct l2tp_session *session; session_list = l2tp_session_id_hash(tunnel, session_id); rcu_read_lock_bh(); hlist_for_each_entry_rcu(session, session_list, hlist) if (session->session_id == session_id) { l2tp_session_inc_refcount(session); rcu_read_unlock_bh(); return session; } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_tunnel_get_session); struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id) { struct hlist_head *session_list; struct l2tp_session *session; session_list = l2tp_session_id_hash_2(l2tp_pernet(net), session_id); rcu_read_lock_bh(); hlist_for_each_entry_rcu(session, session_list, global_hlist) if (session->session_id == session_id) { l2tp_session_inc_refcount(session); rcu_read_unlock_bh(); return session; } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_session_get); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth) { int hash; struct l2tp_session *session; int count = 0; rcu_read_lock_bh(); for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { hlist_for_each_entry_rcu(session, &tunnel->session_hlist[hash], hlist) { if (++count > nth) { l2tp_session_inc_refcount(session); rcu_read_unlock_bh(); return session; } } } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_session_get_nth); /* Lookup a session by interface name. * This is very inefficient but is only used by management interfaces. */ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname) { struct l2tp_net *pn = l2tp_pernet(net); int hash; struct l2tp_session *session; rcu_read_lock_bh(); for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) { hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) { if (!strcmp(session->ifname, ifname)) { l2tp_session_inc_refcount(session); rcu_read_unlock_bh(); return session; } } } rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname); int l2tp_session_register(struct l2tp_session *session, struct l2tp_tunnel *tunnel) { struct l2tp_session *session_walk; struct hlist_head *g_head; struct hlist_head *head; struct l2tp_net *pn; int err; head = l2tp_session_id_hash(tunnel, session->session_id); spin_lock_bh(&tunnel->hlist_lock); if (!tunnel->acpt_newsess) { err = -ENODEV; goto err_tlock; } hlist_for_each_entry(session_walk, head, hlist) if (session_walk->session_id == session->session_id) { err = -EEXIST; goto err_tlock; } if (tunnel->version == L2TP_HDR_VER_3) { pn = l2tp_pernet(tunnel->l2tp_net); g_head = l2tp_session_id_hash_2(pn, session->session_id); spin_lock_bh(&pn->l2tp_session_hlist_lock); /* IP encap expects session IDs to be globally unique, while * UDP encap doesn't. */ hlist_for_each_entry(session_walk, g_head, global_hlist) if (session_walk->session_id == session->session_id && (session_walk->tunnel->encap == L2TP_ENCAPTYPE_IP || tunnel->encap == L2TP_ENCAPTYPE_IP)) { err = -EEXIST; goto err_tlock_pnlock; } l2tp_tunnel_inc_refcount(tunnel); hlist_add_head_rcu(&session->global_hlist, g_head); spin_unlock_bh(&pn->l2tp_session_hlist_lock); } else { l2tp_tunnel_inc_refcount(tunnel); } hlist_add_head_rcu(&session->hlist, head); spin_unlock_bh(&tunnel->hlist_lock); trace_register_session(session); return 0; err_tlock_pnlock: spin_unlock_bh(&pn->l2tp_session_hlist_lock); err_tlock: spin_unlock_bh(&tunnel->hlist_lock); return err; } EXPORT_SYMBOL_GPL(l2tp_session_register); /***************************************************************************** * Receive data handling *****************************************************************************/ /* Queue a skb in order. We come here only if the skb has an L2TP sequence * number. */ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *skb) { struct sk_buff *skbp; struct sk_buff *tmp; u32 ns = L2TP_SKB_CB(skb)->ns; spin_lock_bh(&session->reorder_q.lock); skb_queue_walk_safe(&session->reorder_q, skbp, tmp) { if (L2TP_SKB_CB(skbp)->ns > ns) { __skb_queue_before(&session->reorder_q, skbp, skb); atomic_long_inc(&session->stats.rx_oos_packets); goto out; } } __skb_queue_tail(&session->reorder_q, skb); out: spin_unlock_bh(&session->reorder_q.lock); } /* Dequeue a single skb. */ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff *skb) { struct l2tp_tunnel *tunnel = session->tunnel; int length = L2TP_SKB_CB(skb)->length; /* We're about to requeue the skb, so return resources * to its current owner (a socket receive buffer). */ skb_orphan(skb); atomic_long_inc(&tunnel->stats.rx_packets); atomic_long_add(length, &tunnel->stats.rx_bytes); atomic_long_inc(&session->stats.rx_packets); atomic_long_add(length, &session->stats.rx_bytes); if (L2TP_SKB_CB(skb)->has_seq) { /* Bump our Nr */ session->nr++; session->nr &= session->nr_max; trace_session_seqnum_update(session); } /* call private receive handler */ if (session->recv_skb) (*session->recv_skb)(session, skb, L2TP_SKB_CB(skb)->length); else kfree_skb(skb); } /* Dequeue skbs from the session's reorder_q, subject to packet order. * Skbs that have been in the queue for too long are simply discarded. */ static void l2tp_recv_dequeue(struct l2tp_session *session) { struct sk_buff *skb; struct sk_buff *tmp; /* If the pkt at the head of the queue has the nr that we * expect to send up next, dequeue it and any other * in-sequence packets behind it. */ start: spin_lock_bh(&session->reorder_q.lock); skb_queue_walk_safe(&session->reorder_q, skb, tmp) { struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb); /* If the packet has been pending on the queue for too long, discard it */ if (time_after(jiffies, cb->expires)) { atomic_long_inc(&session->stats.rx_seq_discards); atomic_long_inc(&session->stats.rx_errors); trace_session_pkt_expired(session, cb->ns); session->reorder_skip = 1; __skb_unlink(skb, &session->reorder_q); kfree_skb(skb); continue; } if (cb->has_seq) { if (session->reorder_skip) { session->reorder_skip = 0; session->nr = cb->ns; trace_session_seqnum_reset(session); } if (cb->ns != session->nr) goto out; } __skb_unlink(skb, &session->reorder_q); /* Process the skb. We release the queue lock while we * do so to let other contexts process the queue. */ spin_unlock_bh(&session->reorder_q.lock); l2tp_recv_dequeue_skb(session, skb); goto start; } out: spin_unlock_bh(&session->reorder_q.lock); } static int l2tp_seq_check_rx_window(struct l2tp_session *session, u32 nr) { u32 nws; if (nr >= session->nr) nws = nr - session->nr; else nws = (session->nr_max + 1) - (session->nr - nr); return nws < session->nr_window_size; } /* If packet has sequence numbers, queue it if acceptable. Returns 0 if * acceptable, else non-zero. */ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb) { struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb); if (!l2tp_seq_check_rx_window(session, cb->ns)) { /* Packet sequence number is outside allowed window. * Discard it. */ trace_session_pkt_outside_rx_window(session, cb->ns); goto discard; } if (session->reorder_timeout != 0) { /* Packet reordering enabled. Add skb to session's * reorder queue, in order of ns. */ l2tp_recv_queue_skb(session, skb); goto out; } /* Packet reordering disabled. Discard out-of-sequence packets, while * tracking the number if in-sequence packets after the first OOS packet * is seen. After nr_oos_count_max in-sequence packets, reset the * sequence number to re-enable packet reception. */ if (cb->ns == session->nr) { skb_queue_tail(&session->reorder_q, skb); } else { u32 nr_oos = cb->ns; u32 nr_next = (session->nr_oos + 1) & session->nr_max; if (nr_oos == nr_next) session->nr_oos_count++; else session->nr_oos_count = 0; session->nr_oos = nr_oos; if (session->nr_oos_count > session->nr_oos_count_max) { session->reorder_skip = 1; } if (!session->reorder_skip) { atomic_long_inc(&session->stats.rx_seq_discards); trace_session_pkt_oos(session, cb->ns); goto discard; } skb_queue_tail(&session->reorder_q, skb); } out: return 0; discard: return 1; } /* Do receive processing of L2TP data frames. We handle both L2TPv2 * and L2TPv3 data frames here. * * L2TPv2 Data Message Header * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |T|L|x|x|S|x|O|P|x|x|x|x| Ver | Length (opt) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Tunnel ID | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ns (opt) | Nr (opt) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Offset Size (opt) | Offset pad... (opt) * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Data frames are marked by T=0. All other fields are the same as * those in L2TP control frames. * * L2TPv3 Data Message Header * * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | L2TP Session Header | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | L2-Specific Sublayer | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Tunnel Payload ... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 Session Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cookie (optional, maximum 64 bits)... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 L2-Specific Sublayer Format * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |x|S|x|x|x|x|x|x| Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Cookie value and sublayer format are negotiated with the peer when * the session is set up. Unlike L2TPv2, we do not need to parse the * packet header to determine if optional fields are present. * * Caller must already have parsed the frame and determined that it is * a data (not control) frame before coming here. Fields up to the * session-id have already been parsed and ptr points to the data * after the session-id. */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length) { struct l2tp_tunnel *tunnel = session->tunnel; int offset; /* Parse and check optional cookie */ if (session->peer_cookie_len > 0) { if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) { pr_debug_ratelimited("%s: cookie mismatch (%u/%u). Discarding.\n", tunnel->name, tunnel->tunnel_id, session->session_id); atomic_long_inc(&session->stats.rx_cookie_discards); goto discard; } ptr += session->peer_cookie_len; } /* Handle the optional sequence numbers. Sequence numbers are * in different places for L2TPv2 and L2TPv3. * * If we are the LAC, enable/disable sequence numbers under * the control of the LNS. If no sequence numbers present but * we were expecting them, discard frame. */ L2TP_SKB_CB(skb)->has_seq = 0; if (tunnel->version == L2TP_HDR_VER_2) { if (hdrflags & L2TP_HDRFLAG_S) { /* Store L2TP info in the skb */ L2TP_SKB_CB(skb)->ns = ntohs(*(__be16 *)ptr); L2TP_SKB_CB(skb)->has_seq = 1; ptr += 2; /* Skip past nr in the header */ ptr += 2; } } else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { u32 l2h = ntohl(*(__be32 *)ptr); if (l2h & 0x40000000) { /* Store L2TP info in the skb */ L2TP_SKB_CB(skb)->ns = l2h & 0x00ffffff; L2TP_SKB_CB(skb)->has_seq = 1; } ptr += 4; } if (L2TP_SKB_CB(skb)->has_seq) { /* Received a packet with sequence numbers. If we're the LAC, * check if we sre sending sequence numbers and if not, * configure it so. */ if (!session->lns_mode && !session->send_seq) { trace_session_seqnum_lns_enable(session); session->send_seq = 1; l2tp_session_set_header_len(session, tunnel->version); } } else { /* No sequence numbers. * If user has configured mandatory sequence numbers, discard. */ if (session->recv_seq) { pr_debug_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n", session->name); atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } /* If we're the LAC and we're sending sequence numbers, the * LNS has requested that we no longer send sequence numbers. * If we're the LNS and we're sending sequence numbers, the * LAC is broken. Discard the frame. */ if (!session->lns_mode && session->send_seq) { trace_session_seqnum_lns_disable(session); session->send_seq = 0; l2tp_session_set_header_len(session, tunnel->version); } else if (session->send_seq) { pr_debug_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n", session->name); atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } } /* Session data offset is defined only for L2TPv2 and is * indicated by an optional 16-bit value in the header. */ if (tunnel->version == L2TP_HDR_VER_2) { /* If offset bit set, skip it. */ if (hdrflags & L2TP_HDRFLAG_O) { offset = ntohs(*(__be16 *)ptr); ptr += 2 + offset; } } offset = ptr - optr; if (!pskb_may_pull(skb, offset)) goto discard; __skb_pull(skb, offset); /* Prepare skb for adding to the session's reorder_q. Hold * packets for max reorder_timeout or 1 second if not * reordering. */ L2TP_SKB_CB(skb)->length = length; L2TP_SKB_CB(skb)->expires = jiffies + (session->reorder_timeout ? session->reorder_timeout : HZ); /* Add packet to the session's receive queue. Reordering is done here, if * enabled. Saved L2TP protocol info is stored in skb->sb[]. */ if (L2TP_SKB_CB(skb)->has_seq) { if (l2tp_recv_data_seq(session, skb)) goto discard; } else { /* No sequence numbers. Add the skb to the tail of the * reorder queue. This ensures that it will be * delivered after all previous sequenced skbs. */ skb_queue_tail(&session->reorder_q, skb); } /* Try to dequeue as many skbs from reorder_q as we can. */ l2tp_recv_dequeue(session); return; discard: atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } EXPORT_SYMBOL_GPL(l2tp_recv_common); /* Drop skbs from the session's reorder_q */ static void l2tp_session_queue_purge(struct l2tp_session *session) { struct sk_buff *skb = NULL; while ((skb = skb_dequeue(&session->reorder_q))) { atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } } /* Internal UDP receive frame. Do the real work of receiving an L2TP data frame * here. The skb is not on a list when we get here. * Returns 0 if the packet was a data packet and was successfully passed on. * Returns 1 if the packet was not a good data packet and could not be * forwarded. All such packets are passed up to userspace to deal with. */ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) { struct l2tp_session *session = NULL; unsigned char *ptr, *optr; u16 hdrflags; u32 tunnel_id, session_id; u16 version; int length; /* UDP has verified checksum */ /* UDP always verifies the packet length. */ __skb_pull(skb, sizeof(struct udphdr)); /* Short packet? */ if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) { pr_debug_ratelimited("%s: recv short packet (len=%d)\n", tunnel->name, skb->len); goto invalid; } /* Point to L2TP header */ optr = skb->data; ptr = skb->data; /* Get L2TP header flags */ hdrflags = ntohs(*(__be16 *)ptr); /* Check protocol version */ version = hdrflags & L2TP_HDR_VER_MASK; if (version != tunnel->version) { pr_debug_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n", tunnel->name, version, tunnel->version); goto invalid; } /* Get length of L2TP packet */ length = skb->len; /* If type is control packet, it is handled by userspace. */ if (hdrflags & L2TP_HDRFLAG_T) goto pass; /* Skip flags */ ptr += 2; if (tunnel->version == L2TP_HDR_VER_2) { /* If length is present, skip it */ if (hdrflags & L2TP_HDRFLAG_L) ptr += 2; /* Extract tunnel and session ID */ tunnel_id = ntohs(*(__be16 *)ptr); ptr += 2; session_id = ntohs(*(__be16 *)ptr); ptr += 2; } else { ptr += 2; /* skip reserved bits */ tunnel_id = tunnel->tunnel_id; session_id = ntohl(*(__be32 *)ptr); ptr += 4; } /* Find the session context */ session = l2tp_tunnel_get_session(tunnel, session_id); if (!session || !session->recv_skb) { if (session) l2tp_session_dec_refcount(session); /* Not found? Pass to userspace to deal with */ pr_debug_ratelimited("%s: no session found (%u/%u). Passing up.\n", tunnel->name, tunnel_id, session_id); goto pass; } if (tunnel->version == L2TP_HDR_VER_3 && l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) { l2tp_session_dec_refcount(session); goto invalid; } l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_dec_refcount(session); return 0; invalid: atomic_long_inc(&tunnel->stats.rx_invalid); pass: /* Put UDP header back */ __skb_push(skb, sizeof(struct udphdr)); return 1; } /* UDP encapsulation receive handler. See net/ipv4/udp.c. * Return codes: * 0 : success. * <0: error * >0: skb should be passed up to userspace as UDP. */ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct l2tp_tunnel *tunnel; /* Note that this is called from the encap_rcv hook inside an * RCU-protected region, but without the socket being locked. * Hence we use rcu_dereference_sk_user_data to access the * tunnel data structure rather the usual l2tp_sk_to_tunnel * accessor function. */ tunnel = rcu_dereference_sk_user_data(sk); if (!tunnel) goto pass_up; if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC)) goto pass_up; if (l2tp_udp_recv_core(tunnel, skb)) goto pass_up; return 0; pass_up: return 1; } EXPORT_SYMBOL_GPL(l2tp_udp_encap_recv); /************************************************************************ * Transmit handling ***********************************************************************/ /* Build an L2TP header for the session into the buffer provided. */ static int l2tp_build_l2tpv2_header(struct l2tp_session *session, void *buf) { struct l2tp_tunnel *tunnel = session->tunnel; __be16 *bufp = buf; __be16 *optr = buf; u16 flags = L2TP_HDR_VER_2; u32 tunnel_id = tunnel->peer_tunnel_id; u32 session_id = session->peer_session_id; if (session->send_seq) flags |= L2TP_HDRFLAG_S; /* Setup L2TP header. */ *bufp++ = htons(flags); *bufp++ = htons(tunnel_id); *bufp++ = htons(session_id); if (session->send_seq) { *bufp++ = htons(session->ns); *bufp++ = 0; session->ns++; session->ns &= 0xffff; trace_session_seqnum_update(session); } return bufp - optr; } static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) { struct l2tp_tunnel *tunnel = session->tunnel; char *bufp = buf; char *optr = bufp; /* Setup L2TP header. The header differs slightly for UDP and * IP encapsulations. For UDP, there is 4 bytes of flags. */ if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { u16 flags = L2TP_HDR_VER_3; *((__be16 *)bufp) = htons(flags); bufp += 2; *((__be16 *)bufp) = 0; bufp += 2; } *((__be32 *)bufp) = htonl(session->peer_session_id); bufp += 4; if (session->cookie_len) { memcpy(bufp, &session->cookie[0], session->cookie_len); bufp += session->cookie_len; } if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { u32 l2h = 0; if (session->send_seq) { l2h = 0x40000000 | session->ns; session->ns++; session->ns &= 0xffffff; trace_session_seqnum_update(session); } *((__be32 *)bufp) = htonl(l2h); bufp += 4; } return bufp - optr; } /* Queue the packet to IP for output: tunnel socket lock must be held */ static int l2tp_xmit_queue(struct l2tp_tunnel *tunnel, struct sk_buff *skb, struct flowi *fl) { int err; skb->ignore_df = 1; skb_dst_drop(skb); #if IS_ENABLED(CONFIG_IPV6) if (l2tp_sk_is_v6(tunnel->sock)) err = inet6_csk_xmit(tunnel->sock, skb, NULL); else #endif err = ip_queue_xmit(tunnel->sock, skb, fl); return err >= 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, unsigned int *len) { struct l2tp_tunnel *tunnel = session->tunnel; unsigned int data_len = skb->len; struct sock *sk = tunnel->sock; int headroom, uhlen, udp_len; int ret = NET_XMIT_SUCCESS; struct inet_sock *inet; struct udphdr *uh; /* Check that there's enough headroom in the skb to insert IP, * UDP and L2TP headers. If not enough, expand it to * make room. Adjust truesize. */ uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(*uh) : 0; headroom = NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len; if (skb_cow_head(skb, headroom)) { kfree_skb(skb); return NET_XMIT_DROP; } /* Setup L2TP header */ if (tunnel->version == L2TP_HDR_VER_2) l2tp_build_l2tpv2_header(session, __skb_push(skb, session->hdr_len)); else l2tp_build_l2tpv3_header(session, __skb_push(skb, session->hdr_len)); /* Reset skb netfilter state */ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); nf_reset_ct(skb); bh_lock_sock_nested(sk); if (sock_owned_by_user(sk)) { kfree_skb(skb); ret = NET_XMIT_DROP; goto out_unlock; } /* The user-space may change the connection status for the user-space * provided socket at run time: we must check it under the socket lock */ if (tunnel->fd >= 0 && sk->sk_state != TCP_ESTABLISHED) { kfree_skb(skb); ret = NET_XMIT_DROP; goto out_unlock; } /* Report transmitted length before we add encap header, which keeps * statistics consistent for both UDP and IP encap tx/rx paths. */ *len = skb->len; inet = inet_sk(sk); switch (tunnel->encap) { case L2TP_ENCAPTYPE_UDP: /* Setup UDP header */ __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->source = inet->inet_sport; uh->dest = inet->inet_dport; udp_len = uhlen + session->hdr_len + data_len; uh->len = htons(udp_len); /* Calculate UDP checksum if configured to do so */ #if IS_ENABLED(CONFIG_IPV6) if (l2tp_sk_is_v6(sk)) udp6_set_csum(udp_get_no_check6_tx(sk), skb, &inet6_sk(sk)->saddr, &sk->sk_v6_daddr, udp_len); else #endif udp_set_csum(sk->sk_no_check_tx, skb, inet->inet_saddr, inet->inet_daddr, udp_len); break; case L2TP_ENCAPTYPE_IP: break; } ret = l2tp_xmit_queue(tunnel, skb, &inet->cork.fl); out_unlock: bh_unlock_sock(sk); return ret; } /* If caller requires the skb to have a ppp header, the header must be * inserted in the skb data before calling this function. */ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb) { unsigned int len = 0; int ret; ret = l2tp_xmit_core(session, skb, &len); if (ret == NET_XMIT_SUCCESS) { atomic_long_inc(&session->tunnel->stats.tx_packets); atomic_long_add(len, &session->tunnel->stats.tx_bytes); atomic_long_inc(&session->stats.tx_packets); atomic_long_add(len, &session->stats.tx_bytes); } else { atomic_long_inc(&session->tunnel->stats.tx_errors); atomic_long_inc(&session->stats.tx_errors); } return ret; } EXPORT_SYMBOL_GPL(l2tp_xmit_skb); /***************************************************************************** * Tinnel and session create/destroy. *****************************************************************************/ /* Tunnel socket destruct hook. * The tunnel context is deleted only when all session sockets have been * closed. */ static void l2tp_tunnel_destruct(struct sock *sk) { struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); if (!tunnel) goto end; /* Disable udp encapsulation */ switch (tunnel->encap) { case L2TP_ENCAPTYPE_UDP: /* No longer an encapsulation socket. See net/ipv4/udp.c */ WRITE_ONCE(udp_sk(sk)->encap_type, 0); udp_sk(sk)->encap_rcv = NULL; udp_sk(sk)->encap_destroy = NULL; break; case L2TP_ENCAPTYPE_IP: break; } /* Remove hooks into tunnel socket */ write_lock_bh(&sk->sk_callback_lock); sk->sk_destruct = tunnel->old_sk_destruct; sk->sk_user_data = NULL; write_unlock_bh(&sk->sk_callback_lock); /* Call the original destructor */ if (sk->sk_destruct) (*sk->sk_destruct)(sk); kfree_rcu(tunnel, rcu); end: return; } /* Remove an l2tp session from l2tp_core's hash lists. */ static void l2tp_session_unhash(struct l2tp_session *session) { struct l2tp_tunnel *tunnel = session->tunnel; /* Remove the session from core hashes */ if (tunnel) { /* Remove from the per-tunnel hash */ spin_lock_bh(&tunnel->hlist_lock); hlist_del_init_rcu(&session->hlist); spin_unlock_bh(&tunnel->hlist_lock); /* For L2TPv3 we have a per-net hash: remove from there, too */ if (tunnel->version != L2TP_HDR_VER_2) { struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); spin_lock_bh(&pn->l2tp_session_hlist_lock); hlist_del_init_rcu(&session->global_hlist); spin_unlock_bh(&pn->l2tp_session_hlist_lock); } synchronize_rcu(); } } /* When the tunnel is closed, all the attached sessions need to go too. */ static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) { struct l2tp_session *session; int hash; spin_lock_bh(&tunnel->hlist_lock); tunnel->acpt_newsess = false; for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { again: hlist_for_each_entry_rcu(session, &tunnel->session_hlist[hash], hlist) { hlist_del_init_rcu(&session->hlist); spin_unlock_bh(&tunnel->hlist_lock); l2tp_session_delete(session); spin_lock_bh(&tunnel->hlist_lock); /* Now restart from the beginning of this hash * chain. We always remove a session from the * list so we are guaranteed to make forward * progress. */ goto again; } } spin_unlock_bh(&tunnel->hlist_lock); } /* Tunnel socket destroy hook for UDP encapsulation */ static void l2tp_udp_encap_destroy(struct sock *sk) { struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); if (tunnel) l2tp_tunnel_delete(tunnel); } static void l2tp_tunnel_remove(struct net *net, struct l2tp_tunnel *tunnel) { struct l2tp_net *pn = l2tp_pernet(net); spin_lock_bh(&pn->l2tp_tunnel_idr_lock); idr_remove(&pn->l2tp_tunnel_idr, tunnel->tunnel_id); spin_unlock_bh(&pn->l2tp_tunnel_idr_lock); } /* Workqueue tunnel deletion function */ static void l2tp_tunnel_del_work(struct work_struct *work) { struct l2tp_tunnel *tunnel = container_of(work, struct l2tp_tunnel, del_work); struct sock *sk = tunnel->sock; struct socket *sock = sk->sk_socket; l2tp_tunnel_closeall(tunnel); /* If the tunnel socket was created within the kernel, use * the sk API to release it here. */ if (tunnel->fd < 0) { if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } } l2tp_tunnel_remove(tunnel->l2tp_net, tunnel); /* drop initial ref */ l2tp_tunnel_dec_refcount(tunnel); /* drop workqueue ref */ l2tp_tunnel_dec_refcount(tunnel); } /* Create a socket for the tunnel, if one isn't set up by * userspace. This is used for static tunnels where there is no * managing L2TP daemon. * * Since we don't want these sockets to keep a namespace alive by * themselves, we drop the socket's namespace refcount after creation. * These sockets are freed when the namespace exits using the pernet * exit hook. */ static int l2tp_tunnel_sock_create(struct net *net, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct socket **sockp) { int err = -EINVAL; struct socket *sock = NULL; struct udp_port_cfg udp_conf; switch (cfg->encap) { case L2TP_ENCAPTYPE_UDP: memset(&udp_conf, 0, sizeof(udp_conf)); #if IS_ENABLED(CONFIG_IPV6) if (cfg->local_ip6 && cfg->peer_ip6) { udp_conf.family = AF_INET6; memcpy(&udp_conf.local_ip6, cfg->local_ip6, sizeof(udp_conf.local_ip6)); memcpy(&udp_conf.peer_ip6, cfg->peer_ip6, sizeof(udp_conf.peer_ip6)); udp_conf.use_udp6_tx_checksums = !cfg->udp6_zero_tx_checksums; udp_conf.use_udp6_rx_checksums = !cfg->udp6_zero_rx_checksums; } else #endif { udp_conf.family = AF_INET; udp_conf.local_ip = cfg->local_ip; udp_conf.peer_ip = cfg->peer_ip; udp_conf.use_udp_checksums = cfg->use_udp_checksums; } udp_conf.local_udp_port = htons(cfg->local_udp_port); udp_conf.peer_udp_port = htons(cfg->peer_udp_port); err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) goto out; break; case L2TP_ENCAPTYPE_IP: #if IS_ENABLED(CONFIG_IPV6) if (cfg->local_ip6 && cfg->peer_ip6) { struct sockaddr_l2tpip6 ip6_addr = {0}; err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; ip6_addr.l2tp_family = AF_INET6; memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = tunnel_id; err = kernel_bind(sock, (struct sockaddr *)&ip6_addr, sizeof(ip6_addr)); if (err < 0) goto out; ip6_addr.l2tp_family = AF_INET6; memcpy(&ip6_addr.l2tp_addr, cfg->peer_ip6, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = peer_tunnel_id; err = kernel_connect(sock, (struct sockaddr *)&ip6_addr, sizeof(ip6_addr), 0); if (err < 0) goto out; } else #endif { struct sockaddr_l2tpip ip_addr = {0}; err = sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; err = kernel_bind(sock, (struct sockaddr *)&ip_addr, sizeof(ip_addr)); if (err < 0) goto out; ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->peer_ip; ip_addr.l2tp_conn_id = peer_tunnel_id; err = kernel_connect(sock, (struct sockaddr *)&ip_addr, sizeof(ip_addr), 0); if (err < 0) goto out; } break; default: goto out; } out: *sockp = sock; if (err < 0 && sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); *sockp = NULL; } return err; } int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) { struct l2tp_tunnel *tunnel = NULL; int err; enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP; if (cfg) encap = cfg->encap; tunnel = kzalloc(sizeof(*tunnel), GFP_KERNEL); if (!tunnel) { err = -ENOMEM; goto err; } tunnel->version = version; tunnel->tunnel_id = tunnel_id; tunnel->peer_tunnel_id = peer_tunnel_id; tunnel->magic = L2TP_TUNNEL_MAGIC; sprintf(&tunnel->name[0], "tunl %u", tunnel_id); spin_lock_init(&tunnel->hlist_lock); tunnel->acpt_newsess = true; tunnel->encap = encap; refcount_set(&tunnel->ref_count, 1); tunnel->fd = fd; /* Init delete workqueue struct */ INIT_WORK(&tunnel->del_work, l2tp_tunnel_del_work); INIT_LIST_HEAD(&tunnel->list); err = 0; err: if (tunnelp) *tunnelp = tunnel; return err; } EXPORT_SYMBOL_GPL(l2tp_tunnel_create); static int l2tp_validate_socket(const struct sock *sk, const struct net *net, enum l2tp_encap_type encap) { if (!net_eq(sock_net(sk), net)) return -EINVAL; if (sk->sk_type != SOCK_DGRAM) return -EPROTONOSUPPORT; if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) return -EPROTONOSUPPORT; if ((encap == L2TP_ENCAPTYPE_UDP && sk->sk_protocol != IPPROTO_UDP) || (encap == L2TP_ENCAPTYPE_IP && sk->sk_protocol != IPPROTO_L2TP)) return -EPROTONOSUPPORT; if (sk->sk_user_data) return -EBUSY; return 0; } int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg) { struct l2tp_net *pn = l2tp_pernet(net); u32 tunnel_id = tunnel->tunnel_id; struct socket *sock; struct sock *sk; int ret; spin_lock_bh(&pn->l2tp_tunnel_idr_lock); ret = idr_alloc_u32(&pn->l2tp_tunnel_idr, NULL, &tunnel_id, tunnel_id, GFP_ATOMIC); spin_unlock_bh(&pn->l2tp_tunnel_idr_lock); if (ret) return ret == -ENOSPC ? -EEXIST : ret; if (tunnel->fd < 0) { ret = l2tp_tunnel_sock_create(net, tunnel->tunnel_id, tunnel->peer_tunnel_id, cfg, &sock); if (ret < 0) goto err; } else { sock = sockfd_lookup(tunnel->fd, &ret); if (!sock) goto err; } sk = sock->sk; lock_sock(sk); write_lock_bh(&sk->sk_callback_lock); ret = l2tp_validate_socket(sk, net, tunnel->encap); if (ret < 0) goto err_inval_sock; rcu_assign_sk_user_data(sk, tunnel); write_unlock_bh(&sk->sk_callback_lock); if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { struct udp_tunnel_sock_cfg udp_cfg = { .sk_user_data = tunnel, .encap_type = UDP_ENCAP_L2TPINUDP, .encap_rcv = l2tp_udp_encap_recv, .encap_destroy = l2tp_udp_encap_destroy, }; setup_udp_tunnel_sock(net, sock, &udp_cfg); } tunnel->old_sk_destruct = sk->sk_destruct; sk->sk_destruct = &l2tp_tunnel_destruct; sk->sk_allocation = GFP_ATOMIC; release_sock(sk); sock_hold(sk); tunnel->sock = sk; tunnel->l2tp_net = net; spin_lock_bh(&pn->l2tp_tunnel_idr_lock); idr_replace(&pn->l2tp_tunnel_idr, tunnel, tunnel->tunnel_id); spin_unlock_bh(&pn->l2tp_tunnel_idr_lock); trace_register_tunnel(tunnel); if (tunnel->fd >= 0) sockfd_put(sock); return 0; err_inval_sock: write_unlock_bh(&sk->sk_callback_lock); release_sock(sk); if (tunnel->fd < 0) sock_release(sock); else sockfd_put(sock); err: l2tp_tunnel_remove(net, tunnel); return ret; } EXPORT_SYMBOL_GPL(l2tp_tunnel_register); /* This function is used by the netlink TUNNEL_DELETE command. */ void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { if (!test_and_set_bit(0, &tunnel->dead)) { trace_delete_tunnel(tunnel); l2tp_tunnel_inc_refcount(tunnel); queue_work(l2tp_wq, &tunnel->del_work); } } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); void l2tp_session_delete(struct l2tp_session *session) { if (test_and_set_bit(0, &session->dead)) return; trace_delete_session(session); l2tp_session_unhash(session); l2tp_session_queue_purge(session); if (session->session_close) (*session->session_close)(session); l2tp_session_dec_refcount(session); } EXPORT_SYMBOL_GPL(l2tp_session_delete); /* We come here whenever a session's send_seq, cookie_len or * l2specific_type parameters are set. */ void l2tp_session_set_header_len(struct l2tp_session *session, int version) { if (version == L2TP_HDR_VER_2) { session->hdr_len = 6; if (session->send_seq) session->hdr_len += 4; } else { session->hdr_len = 4 + session->cookie_len; session->hdr_len += l2tp_get_l2specific_len(session); if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP) session->hdr_len += 4; } } EXPORT_SYMBOL_GPL(l2tp_session_set_header_len); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { struct l2tp_session *session; session = kzalloc(sizeof(*session) + priv_size, GFP_KERNEL); if (session) { session->magic = L2TP_SESSION_MAGIC; session->tunnel = tunnel; session->session_id = session_id; session->peer_session_id = peer_session_id; session->nr = 0; if (tunnel->version == L2TP_HDR_VER_2) session->nr_max = 0xffff; else session->nr_max = 0xffffff; session->nr_window_size = session->nr_max / 2; session->nr_oos_count_max = 4; /* Use NR of first received packet */ session->reorder_skip = 1; sprintf(&session->name[0], "sess %u/%u", tunnel->tunnel_id, session->session_id); skb_queue_head_init(&session->reorder_q); INIT_HLIST_NODE(&session->hlist); INIT_HLIST_NODE(&session->global_hlist); if (cfg) { session->pwtype = cfg->pw_type; session->send_seq = cfg->send_seq; session->recv_seq = cfg->recv_seq; session->lns_mode = cfg->lns_mode; session->reorder_timeout = cfg->reorder_timeout; session->l2specific_type = cfg->l2specific_type; session->cookie_len = cfg->cookie_len; memcpy(&session->cookie[0], &cfg->cookie[0], cfg->cookie_len); session->peer_cookie_len = cfg->peer_cookie_len; memcpy(&session->peer_cookie[0], &cfg->peer_cookie[0], cfg->peer_cookie_len); } l2tp_session_set_header_len(session, tunnel->version); refcount_set(&session->ref_count, 1); return session; } return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(l2tp_session_create); /***************************************************************************** * Init and cleanup *****************************************************************************/ static __net_init int l2tp_init_net(struct net *net) { struct l2tp_net *pn = net_generic(net, l2tp_net_id); int hash; idr_init(&pn->l2tp_tunnel_idr); spin_lock_init(&pn->l2tp_tunnel_idr_lock); for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) INIT_HLIST_HEAD(&pn->l2tp_session_hlist[hash]); spin_lock_init(&pn->l2tp_session_hlist_lock); return 0; } static __net_exit void l2tp_exit_net(struct net *net) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel = NULL; unsigned long tunnel_id, tmp; int hash; rcu_read_lock_bh(); idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { if (tunnel) l2tp_tunnel_delete(tunnel); } rcu_read_unlock_bh(); if (l2tp_wq) flush_workqueue(l2tp_wq); rcu_barrier(); for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) WARN_ON_ONCE(!hlist_empty(&pn->l2tp_session_hlist[hash])); idr_destroy(&pn->l2tp_tunnel_idr); } static struct pernet_operations l2tp_net_ops = { .init = l2tp_init_net, .exit = l2tp_exit_net, .id = &l2tp_net_id, .size = sizeof(struct l2tp_net), }; static int __init l2tp_init(void) { int rc = 0; rc = register_pernet_device(&l2tp_net_ops); if (rc) goto out; l2tp_wq = alloc_workqueue("l2tp", WQ_UNBOUND, 0); if (!l2tp_wq) { pr_err("alloc_workqueue failed\n"); unregister_pernet_device(&l2tp_net_ops); rc = -ENOMEM; goto out; } pr_info("L2TP core driver, %s\n", L2TP_DRV_VERSION); out: return rc; } static void __exit l2tp_exit(void) { unregister_pernet_device(&l2tp_net_ops); if (l2tp_wq) { destroy_workqueue(l2tp_wq); l2tp_wq = NULL; } } module_init(l2tp_init); module_exit(l2tp_exit); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP core"); MODULE_LICENSE("GPL"); MODULE_VERSION(L2TP_DRV_VERSION);
47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_STRING_CHOICES_H_ #define _LINUX_STRING_CHOICES_H_ #include <linux/types.h> static inline const char *str_enable_disable(bool v) { return v ? "enable" : "disable"; } static inline const char *str_enabled_disabled(bool v) { return v ? "enabled" : "disabled"; } static inline const char *str_hi_lo(bool v) { return v ? "hi" : "lo"; } #define str_lo_hi(v) str_hi_lo(!(v)) static inline const char *str_high_low(bool v) { return v ? "high" : "low"; } #define str_low_high(v) str_high_low(!(v)) static inline const char *str_read_write(bool v) { return v ? "read" : "write"; } #define str_write_read(v) str_read_write(!(v)) static inline const char *str_on_off(bool v) { return v ? "on" : "off"; } static inline const char *str_yes_no(bool v) { return v ? "yes" : "no"; } #endif
2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 // SPDX-License-Identifier: GPL-2.0-only /* Driver for Realtek USB card reader * * Copyright(c) 2009-2013 Realtek Semiconductor Corp. All rights reserved. * * Author: * Roger Tseng <rogerable@realtek.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/usb.h> #include <linux/platform_device.h> #include <linux/mfd/core.h> #include <linux/rtsx_usb.h> static int polling_pipe = 1; module_param(polling_pipe, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(polling_pipe, "polling pipe (0: ctl, 1: bulk)"); static const struct mfd_cell rtsx_usb_cells[] = { [RTSX_USB_SD_CARD] = { .name = "rtsx_usb_sdmmc", .pdata_size = 0, }, [RTSX_USB_MS_CARD] = { .name = "rtsx_usb_ms", .pdata_size = 0, }, }; static void rtsx_usb_sg_timed_out(struct timer_list *t) { struct rtsx_ucr *ucr = from_timer(ucr, t, sg_timer); dev_dbg(&ucr->pusb_intf->dev, "%s: sg transfer timed out", __func__); usb_sg_cancel(&ucr->current_sg); } static int rtsx_usb_bulk_transfer_sglist(struct rtsx_ucr *ucr, unsigned int pipe, struct scatterlist *sg, int num_sg, unsigned int length, unsigned int *act_len, int timeout) { int ret; dev_dbg(&ucr->pusb_intf->dev, "%s: xfer %u bytes, %d entries\n", __func__, length, num_sg); ret = usb_sg_init(&ucr->current_sg, ucr->pusb_dev, pipe, 0, sg, num_sg, length, GFP_NOIO); if (ret) return ret; ucr->sg_timer.expires = jiffies + msecs_to_jiffies(timeout); add_timer(&ucr->sg_timer); usb_sg_wait(&ucr->current_sg); if (!del_timer_sync(&ucr->sg_timer)) ret = -ETIMEDOUT; else ret = ucr->current_sg.status; if (act_len) *act_len = ucr->current_sg.bytes; return ret; } int rtsx_usb_transfer_data(struct rtsx_ucr *ucr, unsigned int pipe, void *buf, unsigned int len, int num_sg, unsigned int *act_len, int timeout) { if (timeout < 600) timeout = 600; if (num_sg) return rtsx_usb_bulk_transfer_sglist(ucr, pipe, (struct scatterlist *)buf, num_sg, len, act_len, timeout); else return usb_bulk_msg(ucr->pusb_dev, pipe, buf, len, act_len, timeout); } EXPORT_SYMBOL_GPL(rtsx_usb_transfer_data); static inline void rtsx_usb_seq_cmd_hdr(struct rtsx_ucr *ucr, u16 addr, u16 len, u8 seq_type) { rtsx_usb_cmd_hdr_tag(ucr); ucr->cmd_buf[PACKET_TYPE] = seq_type; ucr->cmd_buf[5] = (u8)(len >> 8); ucr->cmd_buf[6] = (u8)len; ucr->cmd_buf[8] = (u8)(addr >> 8); ucr->cmd_buf[9] = (u8)addr; if (seq_type == SEQ_WRITE) ucr->cmd_buf[STAGE_FLAG] = 0; else ucr->cmd_buf[STAGE_FLAG] = STAGE_R; } static int rtsx_usb_seq_write_register(struct rtsx_ucr *ucr, u16 addr, u16 len, u8 *data) { u16 cmd_len = ALIGN(SEQ_WRITE_DATA_OFFSET + len, 4); if (!data) return -EINVAL; if (cmd_len > IOBUF_SIZE) return -EINVAL; rtsx_usb_seq_cmd_hdr(ucr, addr, len, SEQ_WRITE); memcpy(ucr->cmd_buf + SEQ_WRITE_DATA_OFFSET, data, len); return rtsx_usb_transfer_data(ucr, usb_sndbulkpipe(ucr->pusb_dev, EP_BULK_OUT), ucr->cmd_buf, cmd_len, 0, NULL, 100); } static int rtsx_usb_seq_read_register(struct rtsx_ucr *ucr, u16 addr, u16 len, u8 *data) { int i, ret; u16 rsp_len = round_down(len, 4); u16 res_len = len - rsp_len; if (!data) return -EINVAL; /* 4-byte aligned part */ if (rsp_len) { rtsx_usb_seq_cmd_hdr(ucr, addr, len, SEQ_READ); ret = rtsx_usb_transfer_data(ucr, usb_sndbulkpipe(ucr->pusb_dev, EP_BULK_OUT), ucr->cmd_buf, 12, 0, NULL, 100); if (ret) return ret; ret = rtsx_usb_transfer_data(ucr, usb_rcvbulkpipe(ucr->pusb_dev, EP_BULK_IN), data, rsp_len, 0, NULL, 100); if (ret) return ret; } /* unaligned part */ for (i = 0; i < res_len; i++) { ret = rtsx_usb_read_register(ucr, addr + rsp_len + i, data + rsp_len + i); if (ret) return ret; } return 0; } int rtsx_usb_read_ppbuf(struct rtsx_ucr *ucr, u8 *buf, int buf_len) { return rtsx_usb_seq_read_register(ucr, PPBUF_BASE2, (u16)buf_len, buf); } EXPORT_SYMBOL_GPL(rtsx_usb_read_ppbuf); int rtsx_usb_write_ppbuf(struct rtsx_ucr *ucr, u8 *buf, int buf_len) { return rtsx_usb_seq_write_register(ucr, PPBUF_BASE2, (u16)buf_len, buf); } EXPORT_SYMBOL_GPL(rtsx_usb_write_ppbuf); int rtsx_usb_ep0_write_register(struct rtsx_ucr *ucr, u16 addr, u8 mask, u8 data) { u16 value, index; addr |= EP0_WRITE_REG_CMD << EP0_OP_SHIFT; value = swab16(addr); index = mask | data << 8; return usb_control_msg(ucr->pusb_dev, usb_sndctrlpipe(ucr->pusb_dev, 0), RTSX_USB_REQ_REG_OP, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, NULL, 0, 100); } EXPORT_SYMBOL_GPL(rtsx_usb_ep0_write_register); int rtsx_usb_ep0_read_register(struct rtsx_ucr *ucr, u16 addr, u8 *data) { u16 value; u8 *buf; int ret; if (!data) return -EINVAL; buf = kzalloc(sizeof(u8), GFP_KERNEL); if (!buf) return -ENOMEM; addr |= EP0_READ_REG_CMD << EP0_OP_SHIFT; value = swab16(addr); ret = usb_control_msg(ucr->pusb_dev, usb_rcvctrlpipe(ucr->pusb_dev, 0), RTSX_USB_REQ_REG_OP, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, 0, buf, 1, 100); *data = *buf; kfree(buf); return ret; } EXPORT_SYMBOL_GPL(rtsx_usb_ep0_read_register); void rtsx_usb_add_cmd(struct rtsx_ucr *ucr, u8 cmd_type, u16 reg_addr, u8 mask, u8 data) { int i; if (ucr->cmd_idx < (IOBUF_SIZE - CMD_OFFSET) / 4) { i = CMD_OFFSET + ucr->cmd_idx * 4; ucr->cmd_buf[i++] = ((cmd_type & 0x03) << 6) | (u8)((reg_addr >> 8) & 0x3F); ucr->cmd_buf[i++] = (u8)reg_addr; ucr->cmd_buf[i++] = mask; ucr->cmd_buf[i++] = data; ucr->cmd_idx++; } } EXPORT_SYMBOL_GPL(rtsx_usb_add_cmd); int rtsx_usb_send_cmd(struct rtsx_ucr *ucr, u8 flag, int timeout) { int ret; ucr->cmd_buf[CNT_H] = (u8)(ucr->cmd_idx >> 8); ucr->cmd_buf[CNT_L] = (u8)(ucr->cmd_idx); ucr->cmd_buf[STAGE_FLAG] = flag; ret = rtsx_usb_transfer_data(ucr, usb_sndbulkpipe(ucr->pusb_dev, EP_BULK_OUT), ucr->cmd_buf, ucr->cmd_idx * 4 + CMD_OFFSET, 0, NULL, timeout); if (ret) { rtsx_usb_clear_fsm_err(ucr); return ret; } return 0; } EXPORT_SYMBOL_GPL(rtsx_usb_send_cmd); int rtsx_usb_get_rsp(struct rtsx_ucr *ucr, int rsp_len, int timeout) { if (rsp_len <= 0) return -EINVAL; rsp_len = ALIGN(rsp_len, 4); return rtsx_usb_transfer_data(ucr, usb_rcvbulkpipe(ucr->pusb_dev, EP_BULK_IN), ucr->rsp_buf, rsp_len, 0, NULL, timeout); } EXPORT_SYMBOL_GPL(rtsx_usb_get_rsp); static int rtsx_usb_get_status_with_bulk(struct rtsx_ucr *ucr, u16 *status) { int ret; rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, READ_REG_CMD, CARD_EXIST, 0x00, 0x00); rtsx_usb_add_cmd(ucr, READ_REG_CMD, OCPSTAT, 0x00, 0x00); ret = rtsx_usb_send_cmd(ucr, MODE_CR, 100); if (ret) return ret; ret = rtsx_usb_get_rsp(ucr, 2, 100); if (ret) return ret; *status = ((ucr->rsp_buf[0] >> 2) & 0x0f) | ((ucr->rsp_buf[1] & 0x03) << 4); return 0; } int rtsx_usb_get_card_status(struct rtsx_ucr *ucr, u16 *status) { int ret; u16 *buf; if (!status) return -EINVAL; if (polling_pipe == 0) { buf = kzalloc(sizeof(u16), GFP_KERNEL); if (!buf) return -ENOMEM; ret = usb_control_msg(ucr->pusb_dev, usb_rcvctrlpipe(ucr->pusb_dev, 0), RTSX_USB_REQ_POLL, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, buf, 2, 100); *status = *buf; kfree(buf); } else { ret = rtsx_usb_get_status_with_bulk(ucr, status); } /* usb_control_msg may return positive when success */ if (ret < 0) return ret; return 0; } EXPORT_SYMBOL_GPL(rtsx_usb_get_card_status); static int rtsx_usb_write_phy_register(struct rtsx_ucr *ucr, u8 addr, u8 val) { dev_dbg(&ucr->pusb_intf->dev, "Write 0x%x to phy register 0x%x\n", val, addr); rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, HS_VSTAIN, 0xFF, val); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, HS_VCONTROL, 0xFF, addr & 0x0F); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, HS_VLOADM, 0xFF, 0x00); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, HS_VLOADM, 0xFF, 0x00); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, HS_VLOADM, 0xFF, 0x01); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, HS_VCONTROL, 0xFF, (addr >> 4) & 0x0F); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, HS_VLOADM, 0xFF, 0x00); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, HS_VLOADM, 0xFF, 0x00); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, HS_VLOADM, 0xFF, 0x01); return rtsx_usb_send_cmd(ucr, MODE_C, 100); } int rtsx_usb_write_register(struct rtsx_ucr *ucr, u16 addr, u8 mask, u8 data) { rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, addr, mask, data); return rtsx_usb_send_cmd(ucr, MODE_C, 100); } EXPORT_SYMBOL_GPL(rtsx_usb_write_register); int rtsx_usb_read_register(struct rtsx_ucr *ucr, u16 addr, u8 *data) { int ret; if (data != NULL) *data = 0; rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, READ_REG_CMD, addr, 0, 0); ret = rtsx_usb_send_cmd(ucr, MODE_CR, 100); if (ret) return ret; ret = rtsx_usb_get_rsp(ucr, 1, 100); if (ret) return ret; if (data != NULL) *data = ucr->rsp_buf[0]; return 0; } EXPORT_SYMBOL_GPL(rtsx_usb_read_register); static inline u8 double_ssc_depth(u8 depth) { return (depth > 1) ? (depth - 1) : depth; } static u8 revise_ssc_depth(u8 ssc_depth, u8 div) { if (div > CLK_DIV_1) { if (ssc_depth > div - 1) ssc_depth -= (div - 1); else ssc_depth = SSC_DEPTH_2M; } return ssc_depth; } int rtsx_usb_switch_clock(struct rtsx_ucr *ucr, unsigned int card_clock, u8 ssc_depth, bool initial_mode, bool double_clk, bool vpclk) { int ret; u8 n, clk_divider, mcu_cnt, div; if (!card_clock) { ucr->cur_clk = 0; return 0; } if (initial_mode) { /* We use 250k(around) here, in initial stage */ clk_divider = SD_CLK_DIVIDE_128; card_clock = 30000000; } else { clk_divider = SD_CLK_DIVIDE_0; } ret = rtsx_usb_write_register(ucr, SD_CFG1, SD_CLK_DIVIDE_MASK, clk_divider); if (ret < 0) return ret; card_clock /= 1000000; dev_dbg(&ucr->pusb_intf->dev, "Switch card clock to %dMHz\n", card_clock); if (!initial_mode && double_clk) card_clock *= 2; dev_dbg(&ucr->pusb_intf->dev, "Internal SSC clock: %dMHz (cur_clk = %d)\n", card_clock, ucr->cur_clk); if (card_clock == ucr->cur_clk) return 0; /* Converting clock value into internal settings: n and div */ n = card_clock - 2; if ((card_clock <= 2) || (n > MAX_DIV_N)) return -EINVAL; mcu_cnt = 60/card_clock + 3; if (mcu_cnt > 15) mcu_cnt = 15; /* Make sure that the SSC clock div_n is not less than MIN_DIV_N */ div = CLK_DIV_1; while (n < MIN_DIV_N && div < CLK_DIV_4) { n = (n + 2) * 2 - 2; div++; } dev_dbg(&ucr->pusb_intf->dev, "n = %d, div = %d\n", n, div); if (double_clk) ssc_depth = double_ssc_depth(ssc_depth); ssc_depth = revise_ssc_depth(ssc_depth, div); dev_dbg(&ucr->pusb_intf->dev, "ssc_depth = %d\n", ssc_depth); rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CLK_DIV, CLK_CHANGE, CLK_CHANGE); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CLK_DIV, 0x3F, (div << 4) | mcu_cnt); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, SSC_CTL1, SSC_RSTB, 0); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, SSC_CTL2, SSC_DEPTH_MASK, ssc_depth); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, SSC_DIV_N_0, 0xFF, n); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, SSC_CTL1, SSC_RSTB, SSC_RSTB); if (vpclk) { rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, SD_VPCLK0_CTL, PHASE_NOT_RESET, 0); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, SD_VPCLK0_CTL, PHASE_NOT_RESET, PHASE_NOT_RESET); } ret = rtsx_usb_send_cmd(ucr, MODE_C, 2000); if (ret < 0) return ret; ret = rtsx_usb_write_register(ucr, SSC_CTL1, 0xff, SSC_RSTB | SSC_8X_EN | SSC_SEL_4M); if (ret < 0) return ret; /* Wait SSC clock stable */ usleep_range(100, 1000); ret = rtsx_usb_write_register(ucr, CLK_DIV, CLK_CHANGE, 0); if (ret < 0) return ret; ucr->cur_clk = card_clock; return 0; } EXPORT_SYMBOL_GPL(rtsx_usb_switch_clock); int rtsx_usb_card_exclusive_check(struct rtsx_ucr *ucr, int card) { int ret; u16 val; u16 cd_mask[] = { [RTSX_USB_SD_CARD] = (CD_MASK & ~SD_CD), [RTSX_USB_MS_CARD] = (CD_MASK & ~MS_CD) }; ret = rtsx_usb_get_card_status(ucr, &val); /* * If get status fails, return 0 (ok) for the exclusive check * and let the flow fail at somewhere else. */ if (ret) return 0; if (val & cd_mask[card]) return -EIO; return 0; } EXPORT_SYMBOL_GPL(rtsx_usb_card_exclusive_check); static int rtsx_usb_reset_chip(struct rtsx_ucr *ucr) { int ret; u8 val; rtsx_usb_init_cmd(ucr); if (CHECK_PKG(ucr, LQFP48)) { rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, LDO3318_PWR_MASK, LDO_SUSPEND); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, FORCE_LDO_POWERB, FORCE_LDO_POWERB); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PULL_CTL1, 0x30, 0x10); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PULL_CTL5, 0x03, 0x01); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PULL_CTL6, 0x0C, 0x04); } rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, SYS_DUMMY0, NYET_MSAK, NYET_EN); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CD_DEGLITCH_WIDTH, 0xFF, 0x08); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CD_DEGLITCH_EN, XD_CD_DEGLITCH_EN, 0x0); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, SD30_DRIVE_SEL, SD30_DRIVE_MASK, DRIVER_TYPE_D); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_DRIVE_SEL, SD20_DRIVE_MASK, 0x0); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, LDO_POWER_CFG, 0xE0, 0x0); if (ucr->is_rts5179) rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PULL_CTL5, 0x03, 0x01); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_DMA1_CTL, EXTEND_DMA1_ASYNC_SIGNAL, EXTEND_DMA1_ASYNC_SIGNAL); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_INT_PEND, XD_INT | MS_INT | SD_INT, XD_INT | MS_INT | SD_INT); ret = rtsx_usb_send_cmd(ucr, MODE_C, 100); if (ret) return ret; /* config non-crystal mode */ rtsx_usb_read_register(ucr, CFG_MODE, &val); if ((val & XTAL_FREE) || ((val & CLK_MODE_MASK) == CLK_MODE_NON_XTAL)) { ret = rtsx_usb_write_phy_register(ucr, 0xC2, 0x7C); if (ret) return ret; } return 0; } static int rtsx_usb_init_chip(struct rtsx_ucr *ucr) { int ret; u8 val; rtsx_usb_clear_fsm_err(ucr); /* power on SSC */ ret = rtsx_usb_write_register(ucr, FPDCTL, SSC_POWER_MASK, SSC_POWER_ON); if (ret) return ret; usleep_range(100, 1000); ret = rtsx_usb_write_register(ucr, CLK_DIV, CLK_CHANGE, 0x00); if (ret) return ret; /* determine IC version */ ret = rtsx_usb_read_register(ucr, HW_VERSION, &val); if (ret) return ret; ucr->ic_version = val & HW_VER_MASK; /* determine package */ ret = rtsx_usb_read_register(ucr, CARD_SHARE_MODE, &val); if (ret) return ret; if (val & CARD_SHARE_LQFP_SEL) { ucr->package = LQFP48; dev_dbg(&ucr->pusb_intf->dev, "Package: LQFP48\n"); } else { ucr->package = QFN24; dev_dbg(&ucr->pusb_intf->dev, "Package: QFN24\n"); } /* determine IC variations */ rtsx_usb_read_register(ucr, CFG_MODE_1, &val); if (val & RTS5179) { ucr->is_rts5179 = true; dev_dbg(&ucr->pusb_intf->dev, "Device is rts5179\n"); } else { ucr->is_rts5179 = false; } return rtsx_usb_reset_chip(ucr); } static int rtsx_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *usb_dev = interface_to_usbdev(intf); struct rtsx_ucr *ucr; int ret; dev_dbg(&intf->dev, ": Realtek USB Card Reader found at bus %03d address %03d\n", usb_dev->bus->busnum, usb_dev->devnum); ucr = devm_kzalloc(&intf->dev, sizeof(*ucr), GFP_KERNEL); if (!ucr) return -ENOMEM; ucr->pusb_dev = usb_dev; ucr->cmd_buf = kmalloc(IOBUF_SIZE, GFP_KERNEL); if (!ucr->cmd_buf) return -ENOMEM; ucr->rsp_buf = kmalloc(IOBUF_SIZE, GFP_KERNEL); if (!ucr->rsp_buf) { ret = -ENOMEM; goto out_free_cmd_buf; } usb_set_intfdata(intf, ucr); ucr->vendor_id = id->idVendor; ucr->product_id = id->idProduct; mutex_init(&ucr->dev_mutex); ucr->pusb_intf = intf; /* initialize */ ret = rtsx_usb_init_chip(ucr); if (ret) goto out_init_fail; /* initialize USB SG transfer timer */ timer_setup(&ucr->sg_timer, rtsx_usb_sg_timed_out, 0); ret = mfd_add_hotplug_devices(&intf->dev, rtsx_usb_cells, ARRAY_SIZE(rtsx_usb_cells)); if (ret) goto out_init_fail; #ifdef CONFIG_PM intf->needs_remote_wakeup = 1; usb_enable_autosuspend(usb_dev); #endif return 0; out_init_fail: usb_set_intfdata(ucr->pusb_intf, NULL); kfree(ucr->rsp_buf); ucr->rsp_buf = NULL; out_free_cmd_buf: kfree(ucr->cmd_buf); ucr->cmd_buf = NULL; return ret; } static void rtsx_usb_disconnect(struct usb_interface *intf) { struct rtsx_ucr *ucr = (struct rtsx_ucr *)usb_get_intfdata(intf); dev_dbg(&intf->dev, "%s called\n", __func__); mfd_remove_devices(&intf->dev); usb_set_intfdata(ucr->pusb_intf, NULL); kfree(ucr->cmd_buf); ucr->cmd_buf = NULL; kfree(ucr->rsp_buf); ucr->rsp_buf = NULL; } #ifdef CONFIG_PM static int rtsx_usb_suspend(struct usb_interface *intf, pm_message_t message) { struct rtsx_ucr *ucr = (struct rtsx_ucr *)usb_get_intfdata(intf); u16 val = 0; dev_dbg(&intf->dev, "%s called with pm message 0x%04x\n", __func__, message.event); if (PMSG_IS_AUTO(message)) { if (mutex_trylock(&ucr->dev_mutex)) { rtsx_usb_get_card_status(ucr, &val); mutex_unlock(&ucr->dev_mutex); /* Defer the autosuspend if card exists */ if (val & (SD_CD | MS_CD)) return -EAGAIN; } else { /* There is an ongoing operation*/ return -EAGAIN; } } return 0; } static int rtsx_usb_resume_child(struct device *dev, void *data) { pm_request_resume(dev); return 0; } static int rtsx_usb_resume(struct usb_interface *intf) { device_for_each_child(&intf->dev, NULL, rtsx_usb_resume_child); return 0; } static int rtsx_usb_reset_resume(struct usb_interface *intf) { struct rtsx_ucr *ucr = (struct rtsx_ucr *)usb_get_intfdata(intf); rtsx_usb_reset_chip(ucr); device_for_each_child(&intf->dev, NULL, rtsx_usb_resume_child); return 0; } #else /* CONFIG_PM */ #define rtsx_usb_suspend NULL #define rtsx_usb_resume NULL #define rtsx_usb_reset_resume NULL #endif /* CONFIG_PM */ static int rtsx_usb_pre_reset(struct usb_interface *intf) { struct rtsx_ucr *ucr = (struct rtsx_ucr *)usb_get_intfdata(intf); mutex_lock(&ucr->dev_mutex); return 0; } static int rtsx_usb_post_reset(struct usb_interface *intf) { struct rtsx_ucr *ucr = (struct rtsx_ucr *)usb_get_intfdata(intf); mutex_unlock(&ucr->dev_mutex); return 0; } static const struct usb_device_id rtsx_usb_usb_ids[] = { { USB_DEVICE(0x0BDA, 0x0129) }, { USB_DEVICE(0x0BDA, 0x0139) }, { USB_DEVICE(0x0BDA, 0x0140) }, { } }; MODULE_DEVICE_TABLE(usb, rtsx_usb_usb_ids); static struct usb_driver rtsx_usb_driver = { .name = "rtsx_usb", .probe = rtsx_usb_probe, .disconnect = rtsx_usb_disconnect, .suspend = rtsx_usb_suspend, .resume = rtsx_usb_resume, .reset_resume = rtsx_usb_reset_resume, .pre_reset = rtsx_usb_pre_reset, .post_reset = rtsx_usb_post_reset, .id_table = rtsx_usb_usb_ids, .supports_autosuspend = 1, .soft_unbind = 1, }; module_usb_driver(rtsx_usb_driver); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Roger Tseng <rogerable@realtek.com>"); MODULE_DESCRIPTION("Realtek USB Card Reader Driver");
158 158 158 158 157 158 158 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 // SPDX-License-Identifier: GPL-2.0 /* * IPv6 Address Label subsystem * for the IPv6 "Default" Source Address Selection * * Copyright (C)2007 USAGI/WIDE Project */ /* * Author: * YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org> */ #include <linux/kernel.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/in6.h> #include <linux/slab.h> #include <net/addrconf.h> #include <linux/if_addrlabel.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> #if 0 #define ADDRLABEL(x...) printk(x) #else #define ADDRLABEL(x...) do { ; } while (0) #endif /* * Policy Table */ struct ip6addrlbl_entry { struct in6_addr prefix; int prefixlen; int ifindex; int addrtype; u32 label; struct hlist_node list; struct rcu_head rcu; }; /* * Default policy table (RFC6724 + extensions) * * prefix addr_type label * ------------------------------------------------------------------------- * ::1/128 LOOPBACK 0 * ::/0 N/A 1 * 2002::/16 N/A 2 * ::/96 COMPATv4 3 * ::ffff:0:0/96 V4MAPPED 4 * fc00::/7 N/A 5 ULA (RFC 4193) * 2001::/32 N/A 6 Teredo (RFC 4380) * 2001:10::/28 N/A 7 ORCHID (RFC 4843) * fec0::/10 N/A 11 Site-local * (deprecated by RFC3879) * 3ffe::/16 N/A 12 6bone * * Note: 0xffffffff is used if we do not have any policies. * Note: Labels for ULA and 6to4 are different from labels listed in RFC6724. */ #define IPV6_ADDR_LABEL_DEFAULT 0xffffffffUL static const __net_initconst struct ip6addrlbl_init_table { const struct in6_addr *prefix; int prefixlen; u32 label; } ip6addrlbl_init_table[] = { { /* ::/0 */ .prefix = &in6addr_any, .label = 1, }, { /* fc00::/7 */ .prefix = &(struct in6_addr){ { { 0xfc } } } , .prefixlen = 7, .label = 5, }, { /* fec0::/10 */ .prefix = &(struct in6_addr){ { { 0xfe, 0xc0 } } }, .prefixlen = 10, .label = 11, }, { /* 2002::/16 */ .prefix = &(struct in6_addr){ { { 0x20, 0x02 } } }, .prefixlen = 16, .label = 2, }, { /* 3ffe::/16 */ .prefix = &(struct in6_addr){ { { 0x3f, 0xfe } } }, .prefixlen = 16, .label = 12, }, { /* 2001::/32 */ .prefix = &(struct in6_addr){ { { 0x20, 0x01 } } }, .prefixlen = 32, .label = 6, }, { /* 2001:10::/28 */ .prefix = &(struct in6_addr){ { { 0x20, 0x01, 0x00, 0x10 } } }, .prefixlen = 28, .label = 7, }, { /* ::ffff:0:0 */ .prefix = &(struct in6_addr){ { { [10] = 0xff, [11] = 0xff } } }, .prefixlen = 96, .label = 4, }, { /* ::/96 */ .prefix = &in6addr_any, .prefixlen = 96, .label = 3, }, { /* ::1/128 */ .prefix = &in6addr_loopback, .prefixlen = 128, .label = 0, } }; /* Find label */ static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, const struct in6_addr *addr, int addrtype, int ifindex) { if (p->ifindex && p->ifindex != ifindex) return false; if (p->addrtype && p->addrtype != addrtype) return false; if (!ipv6_prefix_equal(addr, &p->prefix, p->prefixlen)) return false; return true; } static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex) { struct ip6addrlbl_entry *p; hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { if (__ip6addrlbl_match(p, addr, type, ifindex)) return p; } return NULL; } u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex) { u32 label; struct ip6addrlbl_entry *p; type &= IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK; rcu_read_lock(); p = __ipv6_addr_label(net, addr, type, ifindex); label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT; rcu_read_unlock(); ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n", __func__, addr, type, ifindex, label); return label; } /* allocate one entry */ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, int prefixlen, int ifindex, u32 label) { struct ip6addrlbl_entry *newp; int addrtype; ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n", __func__, prefix, prefixlen, ifindex, (unsigned int)label); addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK); switch (addrtype) { case IPV6_ADDR_MAPPED: if (prefixlen > 96) return ERR_PTR(-EINVAL); if (prefixlen < 96) addrtype = 0; break; case IPV6_ADDR_COMPATv4: if (prefixlen != 96) addrtype = 0; break; case IPV6_ADDR_LOOPBACK: if (prefixlen != 128) addrtype = 0; break; } newp = kmalloc(sizeof(*newp), GFP_KERNEL); if (!newp) return ERR_PTR(-ENOMEM); ipv6_addr_prefix(&newp->prefix, prefix, prefixlen); newp->prefixlen = prefixlen; newp->ifindex = ifindex; newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); return newp; } /* add a label */ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, int replace) { struct ip6addrlbl_entry *last = NULL, *p = NULL; struct hlist_node *n; int ret = 0; ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, replace); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == newp->prefixlen && p->ifindex == newp->ifindex && ipv6_addr_equal(&p->prefix, &newp->prefix)) { if (!replace) { ret = -EEXIST; goto out; } hlist_replace_rcu(&p->list, &newp->list); kfree_rcu(p, rcu); goto out; } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || (p->prefixlen < newp->prefixlen)) { hlist_add_before_rcu(&newp->list, &p->list); goto out; } last = p; } if (last) hlist_add_behind_rcu(&newp->list, &last->list); else hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) net->ipv6.ip6addrlbl_table.seq++; return ret; } /* add a label */ static int ip6addrlbl_add(struct net *net, const struct in6_addr *prefix, int prefixlen, int ifindex, u32 label, int replace) { struct ip6addrlbl_entry *newp; int ret = 0; ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n", __func__, prefix, prefixlen, ifindex, (unsigned int)label, replace); newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); if (IS_ERR(newp)) return PTR_ERR(newp); spin_lock(&net->ipv6.ip6addrlbl_table.lock); ret = __ip6addrlbl_add(net, newp, replace); spin_unlock(&net->ipv6.ip6addrlbl_table.lock); if (ret) kfree(newp); return ret; } /* remove a label */ static int __ip6addrlbl_del(struct net *net, const struct in6_addr *prefix, int prefixlen, int ifindex) { struct ip6addrlbl_entry *p = NULL; struct hlist_node *n; int ret = -ESRCH; ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, prefixlen, ifindex); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); kfree_rcu(p, rcu); ret = 0; break; } } return ret; } static int ip6addrlbl_del(struct net *net, const struct in6_addr *prefix, int prefixlen, int ifindex) { struct in6_addr prefix_buf; int ret; ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, prefixlen, ifindex); ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); spin_lock(&net->ipv6.ip6addrlbl_table.lock); ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); spin_unlock(&net->ipv6.ip6addrlbl_table.lock); return ret; } /* add default label */ static int __net_init ip6addrlbl_net_init(struct net *net) { struct ip6addrlbl_entry *p = NULL; struct hlist_node *n; int err; int i; ADDRLABEL(KERN_DEBUG "%s\n", __func__); spin_lock_init(&net->ipv6.ip6addrlbl_table.lock); INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { err = ip6addrlbl_add(net, ip6addrlbl_init_table[i].prefix, ip6addrlbl_init_table[i].prefixlen, 0, ip6addrlbl_init_table[i].label, 0); if (err) goto err_ip6addrlbl_add; } return 0; err_ip6addrlbl_add: hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { hlist_del_rcu(&p->list); kfree_rcu(p, rcu); } return err; } static void __net_exit ip6addrlbl_net_exit(struct net *net) { struct ip6addrlbl_entry *p = NULL; struct hlist_node *n; /* Remove all labels belonging to the exiting net */ spin_lock(&net->ipv6.ip6addrlbl_table.lock); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { hlist_del_rcu(&p->list); kfree_rcu(p, rcu); } spin_unlock(&net->ipv6.ip6addrlbl_table.lock); } static struct pernet_operations ipv6_addr_label_ops = { .init = ip6addrlbl_net_init, .exit = ip6addrlbl_net_exit, }; int __init ipv6_addr_label_init(void) { return register_pernet_subsys(&ipv6_addr_label_ops); } void ipv6_addr_label_cleanup(void) { unregister_pernet_subsys(&ipv6_addr_label_ops); } static const struct nla_policy ifal_policy[IFAL_MAX+1] = { [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), }, [IFAL_LABEL] = { .len = sizeof(u32), }, }; static bool addrlbl_ifindex_exists(struct net *net, int ifindex) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); rcu_read_unlock(); return dev != NULL; } static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifaddrlblmsg *ifal; struct nlattr *tb[IFAL_MAX+1]; struct in6_addr *pfx; u32 label; int err = 0; err = nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, extack); if (err < 0) return err; ifal = nlmsg_data(nlh); if (ifal->ifal_family != AF_INET6 || ifal->ifal_prefixlen > 128) return -EINVAL; if (!tb[IFAL_ADDRESS]) return -EINVAL; pfx = nla_data(tb[IFAL_ADDRESS]); if (!tb[IFAL_LABEL]) return -EINVAL; label = nla_get_u32(tb[IFAL_LABEL]); if (label == IPV6_ADDR_LABEL_DEFAULT) return -EINVAL; switch (nlh->nlmsg_type) { case RTM_NEWADDRLABEL: if (ifal->ifal_index && !addrlbl_ifindex_exists(net, ifal->ifal_index)) return -EINVAL; err = ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen, ifal->ifal_index, label, nlh->nlmsg_flags & NLM_F_REPLACE); break; case RTM_DELADDRLABEL: err = ip6addrlbl_del(net, pfx, ifal->ifal_prefixlen, ifal->ifal_index); break; default: err = -EOPNOTSUPP; } return err; } static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, int prefixlen, int ifindex, u32 lseq) { struct ifaddrlblmsg *ifal = nlmsg_data(nlh); ifal->ifal_family = AF_INET6; ifal->__ifal_reserved = 0; ifal->ifal_prefixlen = prefixlen; ifal->ifal_flags = 0; ifal->ifal_index = ifindex; ifal->ifal_seq = lseq; }; static int ip6addrlbl_fill(struct sk_buff *skb, struct ip6addrlbl_entry *p, u32 lseq, u32 portid, u32 seq, int event, unsigned int flags) { struct nlmsghdr *nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrlblmsg), flags); if (!nlh) return -EMSGSIZE; ip6addrlbl_putmsg(nlh, p->prefixlen, p->ifindex, lseq); if (nla_put_in6_addr(skb, IFAL_ADDRESS, &p->prefix) < 0 || nla_put_u32(skb, IFAL_LABEL, p->label) < 0) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } nlmsg_end(skb, nlh); return 0; } static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ifaddrlblmsg *ifal; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request"); return -EINVAL; } ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_prefixlen || ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ifal))) { NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump request"); return -EINVAL; } return 0; } static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; int idx = 0, s_idx = cb->args[0]; int err; if (cb->strict_check) { err = ip6addrlbl_valid_dump_req(nlh, cb->extack); if (err < 0) return err; } rcu_read_lock(); hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, net->ipv6.ip6addrlbl_table.seq, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWADDRLABEL, NLM_F_MULTI); if (err < 0) break; } idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static inline int ip6addrlbl_msgsize(void) { return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) + nla_total_size(16) /* IFAL_ADDRESS */ + nla_total_size(4); /* IFAL_LABEL */ } static int ip6addrlbl_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifaddrlblmsg *ifal; int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, extack); ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, extack); if (err) return err; for (i = 0; i <= IFAL_MAX; i++) { if (!tb[i]) continue; switch (i) { case IFAL_ADDRESS: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in addrlabel get request"); return -EINVAL; } } return 0; } static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct ifaddrlblmsg *ifal; struct nlattr *tb[IFAL_MAX+1]; struct in6_addr *addr; u32 lseq; int err = 0; struct ip6addrlbl_entry *p; struct sk_buff *skb; err = ip6addrlbl_valid_get_req(in_skb, nlh, tb, extack); if (err < 0) return err; ifal = nlmsg_data(nlh); if (ifal->ifal_family != AF_INET6 || ifal->ifal_prefixlen != 128) return -EINVAL; if (ifal->ifal_index && !addrlbl_ifindex_exists(net, ifal->ifal_index)) return -EINVAL; if (!tb[IFAL_ADDRESS]) return -EINVAL; addr = nla_data(tb[IFAL_ADDRESS]); skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL); if (!skb) return -ENOBUFS; err = -ESRCH; rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); lseq = net->ipv6.ip6addrlbl_table.seq; if (p) err = ip6addrlbl_fill(skb, p, lseq, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWADDRLABEL, 0); rcu_read_unlock(); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); } else { err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } return err; } int __init ipv6_addr_label_rtnl_register(void) { int ret; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, NULL, RTNL_FLAG_DOIT_UNLOCKED); if (ret < 0) return ret; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, NULL, RTNL_FLAG_DOIT_UNLOCKED); if (ret < 0) return ret; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); return ret; }
3 1509 1509 1509 1509 23 639 2677 75 1003 1003 2613 2613 2613 2613 3 640 2775 2775 2774 1003 1003 498 1003 503 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* memcontrol.h - Memory Controller * * Copyright IBM Corporation, 2007 * Author Balbir Singh <balbir@linux.vnet.ibm.com> * * Copyright 2007 OpenVZ SWsoft Inc * Author: Pavel Emelianov <xemul@openvz.org> */ #ifndef _LINUX_MEMCONTROL_H #define _LINUX_MEMCONTROL_H #include <linux/cgroup.h> #include <linux/vm_event_item.h> #include <linux/hardirq.h> #include <linux/jump_label.h> #include <linux/page_counter.h> #include <linux/vmpressure.h> #include <linux/eventfd.h> #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/writeback.h> #include <linux/page-flags.h> #include <linux/shrinker.h> struct mem_cgroup; struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, MEMCG_NR_STAT, }; enum memcg_memory_event { MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, }; struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; unsigned int generation; }; #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 struct mem_cgroup_id { int id; refcount_t ref; }; /* * Per memcg event counter is incremented at every pagein/pageout. With THP, * it will be incremented by the number of pages. This counter is used * to trigger some periodic events. This is straightforward and better * than using jiffies etc. to handle periodic memcg event. */ enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, MEM_CGROUP_TARGET_SOFTLIMIT, MEM_CGROUP_NTARGETS, }; struct memcg_vmstats_percpu; struct memcg_vmstats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; /* scan generation, increased every round-trip */ unsigned int generation; }; struct lruvec_stats_percpu { /* Local (CPU and cgroup) state */ long state[NR_VM_NODE_STAT_ITEMS]; /* Delta calculation for lockless upward propagation */ long state_prev[NR_VM_NODE_STAT_ITEMS]; }; struct lruvec_stats { /* Aggregated (CPU and subtree) state */ long state[NR_VM_NODE_STAT_ITEMS]; /* Non-hierarchical (CPU aggregated) state */ long state_local[NR_VM_NODE_STAT_ITEMS]; /* Pending child counts during tree propagation */ long state_pending[NR_VM_NODE_STAT_ITEMS]; }; /* * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; struct lruvec_stats lruvec_stats; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; struct shrinker_info __rcu *shrinker_info; struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; unsigned long threshold; }; /* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below or equal to usage. */ int current_threshold; /* Size of entries[] */ unsigned int size; /* Array of thresholds */ struct mem_cgroup_threshold entries[] __counted_by(size); }; struct mem_cgroup_thresholds { /* Primary thresholds array */ struct mem_cgroup_threshold_ary *primary; /* * Spare threshold array. * This is needed to make mem_cgroup_unregister_event() "never fail". * It must be able to store at least primary->size - 1 entries. */ struct mem_cgroup_threshold_ary *spare; }; /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. Inode sharing is expected to be uncommon and, even if we miss * one in a given round, we're likely to catch it later if it keeps * foreign-dirtying, so a fairly low count should be enough. * * See mem_cgroup_track_foreign_dirty_slowpath() for details. */ #define MEMCG_CGWB_FRN_CNT 4 struct memcg_cgwb_frn { u64 bdi_id; /* bdi->id of the foreign inode */ int memcg_id; /* memcg->css.id of foreign inode */ u64 at; /* jiffies_64 at the time of dirtying */ struct wb_completion done; /* tracks in-flight foreign writebacks */ }; /* * Bucket for arbitrarily byte-sized objects charged to a memory * cgroup. The bucket can be reparented in one piece when the cgroup * is destroyed, without having to round up the individual references * of all live memory objects in the wild. */ struct obj_cgroup { struct percpu_ref refcnt; struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; }; /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, * to help the administrator determine what knobs to tune. */ struct mem_cgroup { struct cgroup_subsys_state css; /* Private memcg ID. Used to ID objects that outlive the cgroup */ struct mem_cgroup_id id; /* Accounted resources */ struct page_counter memory; /* Both v1 & v2 */ union { struct page_counter swap; /* v2 only */ struct page_counter memsw; /* v1 only */ }; /* Legacy consumer-oriented counters */ struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */ /* Range enforcement for interrupt charges */ struct work_struct high_work; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) unsigned long zswap_max; #endif unsigned long soft_limit; /* vmpressure notifications */ struct vmpressure vmpressure; /* * Should the OOM killer kill all belonging tasks, had it kill one? */ bool oom_group; /* protected by memcg_oom_lock */ bool oom_lock; int under_oom; int swappiness; /* OOM-Killer disable */ int oom_kill_disable; /* memory.events and memory.events.local */ struct cgroup_file events_file; struct cgroup_file events_local_file; /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; /* protect arrays of thresholds */ struct mutex thresholds_lock; /* thresholds for memory usage. RCU-protected */ struct mem_cgroup_thresholds thresholds; /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; /* For oom notifier event fd */ struct list_head oom_notify; /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; /* taken only while moving_account > 0 */ spinlock_t move_lock; unsigned long move_lock_flags; CACHELINE_PADDING(_pad1_); /* memory.stat */ struct memcg_vmstats *vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; /* * Hint of reclaim pressure for socket memroy management. Note * that this indicator should NOT be used in legacy cgroup mode * where socket memory is accounted/charged separately. */ unsigned long socket_pressure; /* Legacy tcp memory accounting */ bool tcpmem_active; int tcpmem_pressure; #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; /* * memcg->objcg is wiped out as a part of the objcg repaprenting * process. memcg->orig_objcg preserves a pointer (and a reference) * to the original objcg until the end of live of memcg. */ struct obj_cgroup __rcu *objcg; struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; #endif CACHELINE_PADDING(_pad2_); /* * set > 0 if pages under this cgroup are moving to other cgroup. */ atomic_t moving_account; struct task_struct *move_lock_task; struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_LRU_GEN /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif struct mem_cgroup_per_node *nodeinfo[]; }; /* * size of first charge trial. * TODO: maybe necessary to use big numbers in big irons or dynamic based of the * workload. */ #define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an objcgs vector */ MEMCG_DATA_OBJCGS = (1UL << 0), /* page has been accounted as a non-slab kernel page */ MEMCG_DATA_KMEM = (1UL << 1), /* the next bit after the last actual flag */ __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) static inline bool folio_memcg_kmem(struct folio *folio); /* * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. * * The caller must ensure that the returned memcg won't be released: * e.g. acquire the rcu_read_lock or css_set_lock. */ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) { return READ_ONCE(objcg->memcg); } /* * __folio_memcg - Get the memory cgroup associated with a non-kmem folio * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios or * kmem folios. */ static inline struct mem_cgroup *__folio_memcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* * __folio_objcg - get the object cgroup associated with a kmem folio. * @folio: Pointer to the folio. * * Returns a pointer to the object cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper object cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios or * LRU folios. */ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* * folio_memcg - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * * - the folio lock * - LRU isolation * - folio_memcg_lock() * - exclusive reference * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. */ static inline struct mem_cgroup *folio_memcg(struct folio *folio) { if (folio_memcg_kmem(folio)) return obj_cgroup_memcg(__folio_objcg(folio)); return __folio_memcg(folio); } static inline struct mem_cgroup *page_memcg(struct page *page) { return folio_memcg(page_folio(page)); } /** * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * * Return: A pointer to the memory cgroup associated with the folio, * or NULL. */ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { unsigned long memcg_data = READ_ONCE(folio->memcg_data); VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); WARN_ON_ONCE(!rcu_read_lock_held()); if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); return obj_cgroup_memcg(objcg); } return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* * folio_memcg_check - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function unlike folio_memcg() can take any folio * as an argument. It has to be used in cases when it's not known if a folio * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * * - the folio lock * - LRU isolation * - lock_folio_memcg() * - exclusive reference * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. */ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { /* * Because folio->memcg_data might be changed asynchronously * for slabs, READ_ONCE() should be used here. */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); return obj_cgroup_memcg(objcg); } return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } static inline struct mem_cgroup *page_memcg_check(struct page *page) { if (PageTail(page)) return NULL; return folio_memcg_check((struct folio *)page); } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { struct mem_cgroup *memcg; rcu_read_lock(); retry: memcg = obj_cgroup_memcg(objcg); if (unlikely(!css_tryget(&memcg->css))) goto retry; rcu_read_unlock(); return memcg; } #ifdef CONFIG_MEMCG_KMEM /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. * * Checks if the folio has MemcgKmem flag set. The caller must ensure * that the folio has an associated memory cgroup. It's not safe to call * this function against some types of folios, e.g. slab folios. */ static inline bool folio_memcg_kmem(struct folio *folio) { VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio); return folio->memcg_data & MEMCG_DATA_KMEM; } #else static inline bool folio_memcg_kmem(struct folio *folio) { return false; } #endif static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); } static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; if (mem_cgroup_disabled()) return; /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because * mem_cgroup_calculate_protection is not robust enough to keep * the protection invariant for calculated effective values for * parallel reclaimers with different reclaim target. This is * especially a problem for tail memcgs (as they have pages on LRU) * which would want to have effective values 0 for targeted reclaim * but a different value for external reclaim. * * Example * Let's have global and A's reclaim in parallel: * | * A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G) * |\ * | C (low = 1G, usage = 2.5G) * B (low = 1G, usage = 0.5G) * * For the global reclaim * A.elow = A.low * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow * C.elow = min(C.usage, C.low) * * With the effective values resetting we have A reclaim * A.elow = 0 * B.elow = B.low * C.elow = C.low * * If the global reclaim races with A's reclaim then * B.elow = C.elow = 0 because children_low_usage > A.elow) * is possible and reclaiming B would be violating the protection. * */ if (root == memcg) return; *min = READ_ONCE(memcg->memory.emin); *low = READ_ONCE(memcg->memory.elow); } void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support * protection. The target memcg's protection is ignored, see * mem_cgroup_calculate_protection() and mem_cgroup_protection() */ return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) || memcg == target; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.emin) >= page_counter_read(&memcg->memory); } void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** * mem_cgroup_charge - Charge a newly allocated folio to a cgroup. * @folio: Folio to charge. * @mm: mm context of the allocating task. * @gfp: Reclaim mode. * * Try to charge @folio to the memcg that @mm belongs to, reclaiming * pages according to @gfp if necessary. If @mm is NULL, try to * charge to the active memcg. * * Do not use this for folios allocated for swapin. * * Return: 0 on success. Otherwise, an error code is returned. */ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_charge(folio, mm, gfp); } int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); void __mem_cgroup_uncharge(struct folio *folio); /** * mem_cgroup_uncharge - Uncharge a folio. * @folio: Folio to uncharge. * * Uncharge a folio previously charged with mem_cgroup_charge(). */ static inline void mem_cgroup_uncharge(struct folio *folio) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge(folio); } void __mem_cgroup_uncharge_list(struct list_head *page_list); static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_list(page_list); } void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * @pgdat: pglist_data * * Returns the lru list vector holding pages for a given @memcg & * @pgdat combination. This can be the node lruvec, if the memory * controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { lruvec = &pgdat->__lruvec; goto out; } if (!memcg) memcg = root_mem_cgroup; mz = memcg->nodeinfo[pgdat->node_id]; lruvec = &mz->lruvec; out: /* * Since a node can be onlined after the mem_cgroup was created, * we have to be prepared to initialize lruvec->pgdat here; * and if offlined then reonlined, we need to reinitialize it. */ if (unlikely(lruvec->pgdat != pgdat)) lruvec->pgdat = pgdat; return lruvec; } /** * folio_lruvec - return lruvec for isolating/putting an LRU folio * @folio: Pointer to the folio. * * This function relies on folio->mem_cgroup being stable. */ static inline struct lruvec *folio_lruvec(struct folio *folio) { struct mem_cgroup *memcg = folio_memcg(folio); VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio); return mem_cgroup_lruvec(memcg, folio_pgdat(folio)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_current(void); struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); #ifdef CONFIG_DEBUG_VM void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); #else static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } #endif static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { return percpu_ref_tryget(&objcg->refcnt); } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { percpu_ref_get(&objcg->refcnt); } static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, unsigned long nr) { percpu_ref_get_many(&objcg->refcnt, nr); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { percpu_ref_put(&objcg->refcnt); } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return !memcg || css_tryget(&memcg->css); } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) css_put(&memcg->css); } #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; return memcg->id.id; } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? cgroup_ino(memcg->css.cgroup) : 0; } struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); #endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { struct mem_cgroup_per_node *mz; if (mem_cgroup_disabled()) return NULL; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find * * Returns the parent memcg, or NULL if this is the root. */ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return mem_cgroup_from_css(memcg->css.parent); } static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { if (root == memcg) return true; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; bool match = false; rcu_read_lock(); task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (task_memcg) match = mem_cgroup_is_descendant(task_memcg, memcg); rcu_read_unlock(); return match; } struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return true; return !!(memcg->css.flags & CSS_ONLINE); } void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { struct mem_cgroup_per_node *mz; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } void mem_cgroup_handle_over_high(gfp_t gfp_mask); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); unsigned long mem_cgroup_size(struct mem_cgroup *memcg); void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); static inline void mem_cgroup_enter_user_fault(void) { WARN_ON(current->in_user_fault); current->in_user_fault = 1; } static inline void mem_cgroup_exit_user_fault(void) { WARN_ON(!current->in_user_fault); current->in_user_fault = 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { return p->memcg_in_oom; } bool mem_cgroup_oom_synchronize(bool wait); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); void folio_memcg_lock(struct folio *folio); void folio_memcg_unlock(struct folio *folio); void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); /* try to stablize folio_memcg() for all the pages in a memcg */ static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) { rcu_read_lock(); if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) return true; rcu_read_unlock(); return false; } static inline void mem_cgroup_unlock_pages(void) { rcu_read_unlock(); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { unsigned long flags; local_irq_save(flags); __mod_memcg_state(memcg, idx, val); local_irq_restore(flags); } static inline void mod_memcg_page_state(struct page *page, int idx, int val) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = page_memcg(page); if (memcg) mod_memcg_state(memcg, idx, val); rcu_read_unlock(); } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); x = READ_ONCE(pn->lruvec_stats.state[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; long x = 0; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); x = READ_ONCE(pn->lruvec_stats.state_local[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } void mem_cgroup_flush_stats(void); void mem_cgroup_flush_stats_ratelimited(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_kmem_state(p, idx, val); local_irq_restore(flags); } static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_memcg_lruvec_state(lruvec, idx, val); local_irq_restore(flags); } void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { unsigned long flags; local_irq_save(flags); __count_memcg_events(memcg, idx, count); local_irq_restore(flags); } static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { struct mem_cgroup *memcg = folio_memcg(folio); if (memcg) count_memcg_events(memcg, idx, nr); } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) count_memcg_events(memcg, idx, 1); rcu_read_unlock(); } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || event == MEMCG_SWAP_FAIL; atomic_long_inc(&memcg->memory_events_local[event]); if (!swap_event) cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); if (swap_event) cgroup_file_notify(&memcg->swap_events_file); else cgroup_file_notify(&memcg->events_file); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) memcg_memory_event(memcg, event); rcu_read_unlock(); } void split_page_memcg(struct page *head, unsigned int nr); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 static inline struct mem_cgroup *folio_memcg(struct folio *folio) { return NULL; } static inline struct mem_cgroup *page_memcg(struct page *page) { return NULL; } static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { WARN_ON_ONCE(!rcu_read_lock_held()); return NULL; } static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { return NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) { return NULL; } static inline bool folio_memcg_kmem(struct folio *folio) { return false; } static inline bool PageMemcgKmem(struct page *page) { return false; } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_disabled(void) { return true; } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg) { } static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) { } static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages) { return 0; } static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; } static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) { } static inline void mem_cgroup_uncharge(struct folio *folio) { } static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { } static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { } static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); return &pgdat->__lruvec; } static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { return true; } static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_current(void) { return NULL; } static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { return NULL; } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return true; } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flagsp) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; } static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { return NULL; } static inline void mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev) { } static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) { } static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(id); /* XXX: This should always return root_mem_cgroup */ return NULL; } #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } #endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { return NULL; } static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; } static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { return 0; } static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) { return 0; } static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { } static inline void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } static inline void folio_memcg_lock(struct folio *folio) { } static inline void folio_memcg_unlock(struct folio *folio) { } static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) { /* to match folio_memcg_rcu() */ rcu_read_lock(); return true; } static inline void mem_cgroup_unlock_pages(void) { rcu_read_unlock(); } static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } static inline void mem_cgroup_enter_user_fault(void) { } static inline void mem_cgroup_exit_user_fault(void) { } static inline bool task_in_memcg_oom(struct task_struct *p) { return false; } static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; } static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { return NULL; } static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int nr) { } static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int nr) { } static inline void mod_memcg_page_state(struct page *page, int idx, int val) { } static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline void mem_cgroup_flush_stats(void) { } static inline void mem_cgroup_flush_stats_ratelimited(void) { } static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { } static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); __mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); mod_node_page_state(page_pgdat(page), idx, val); } static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } static inline void split_page_memcg(struct page *head, unsigned int nr) { } static inline unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned) { return 0; } #endif /* CONFIG_MEMCG */ static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, 1); } static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, -1); } static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; memcg = lruvec_memcg(lruvec); if (!memcg) return NULL; memcg = parent_mem_cgroup(memcg); if (!memcg) return NULL; return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } static inline void unlock_page_lruvec(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); } static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); } static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); } /* Test requires a stable page->memcg binding, see page_memcg() */ static inline bool folio_matches_lruvec(struct folio *folio, struct lruvec *lruvec) { return lruvec_pgdat(lruvec) == folio_pgdat(folio) && lruvec_memcg(lruvec) == folio_memcg(folio); } /* Don't lock again iff page's lruvec locked */ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, struct lruvec *locked_lruvec) { if (locked_lruvec) { if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irq(locked_lruvec); } return folio_lruvec_lock_irq(folio); } /* Don't lock again iff page's lruvec locked */ static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, struct lruvec *locked_lruvec, unsigned long *flags) { if (locked_lruvec) { if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irqrestore(locked_lruvec, *flags); } return folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback); void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb); static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; memcg = folio_memcg(folio); if (unlikely(memcg && &memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); #else /* CONFIG_CGROUP_WRITEBACK */ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) { return NULL; } static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback) { } static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { } static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) { } #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return !!memcg->tcpmem_pressure; do { if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) return true; } while ((memcg = parent_mem_cgroup(memcg))); return false; } int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; static inline void mem_cgroup_sk_free(struct sock *sk) { }; static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; } static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } #endif #ifdef CONFIG_MEMCG_KMEM bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); /* * The returned objcg pointer is safe to use without additional * protection within a scope. The scope is defined either by * the current task (similar to the "current" global variable) * or by set_active_memcg() pair. * Please, use obj_cgroup_get() to get a reference if the pointer * needs to be used outside of the local scope. */ struct obj_cgroup *current_obj_cgroup(void); struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); static inline struct obj_cgroup *get_obj_cgroup_from_current(void) { struct obj_cgroup *objcg = current_obj_cgroup(); if (objcg) obj_cgroup_get(objcg); return objcg; } int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); extern struct static_key_false memcg_bpf_enabled_key; static inline bool memcg_bpf_enabled(void) { return static_branch_likely(&memcg_bpf_enabled_key); } extern struct static_key_false memcg_kmem_online_key; static inline bool memcg_kmem_online(void) { return static_branch_likely(&memcg_kmem_online_key); } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { if (memcg_kmem_online()) return __memcg_kmem_charge_page(page, gfp, order); return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { if (memcg_kmem_online()) __memcg_kmem_uncharge_page(page, order); } /* * A helper for accessing memcg's kmem_id, used for getting * corresponding LRU lists. */ static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return memcg ? memcg->kmemcg_id : -1; } struct mem_cgroup *mem_cgroup_from_obj(void *p); struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_event(struct obj_cgroup *objcg, enum vm_event_item idx) { struct mem_cgroup *memcg; if (!memcg_kmem_online()) return; rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); count_memcg_events(memcg, idx, 1); rcu_read_unlock(); } #else static inline bool mem_cgroup_kmem_disabled(void) { return true; } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { } static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { return NULL; } static inline bool memcg_bpf_enabled(void) { return false; } static inline bool memcg_kmem_online(void) { return false; } static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return -1; } static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) { return NULL; } static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { return NULL; } static inline void count_objcg_event(struct obj_cgroup *objcg, enum vm_event_item idx) { } #endif /* CONFIG_MEMCG_KMEM */ #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); #else static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { return true; } static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) { } static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { } #endif #endif /* _LINUX_MEMCONTROL_H */
9 23 7 3 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for the UDP-Lite (RFC 3828) code. */ #ifndef _UDPLITE_H #define _UDPLITE_H #include <net/ip6_checksum.h> #include <net/udp.h> /* UDP-Lite socket options */ #define UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */ #define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ extern struct proto udplite_prot; extern struct udp_table udplite_table; /* * Checksum computation is all in software, hence simpler getfrag. */ static __inline__ int udplite_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT; } /* * Checksumming routines */ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) { u16 cscov; /* In UDPv4 a zero checksum means that the transmitter generated no * checksum. UDP-Lite (like IPv6) mandates checksums, hence packets * with a zero checksum field are illegal. */ if (uh->check == 0) { net_dbg_ratelimited("UDPLite: zeroed checksum field\n"); return 1; } cscov = ntohs(uh->len); if (cscov == 0) /* Indicates that full coverage is required. */ ; else if (cscov < 8 || cscov > skb->len) { /* * Coverage length violates RFC 3828: log and discard silently. */ net_dbg_ratelimited("UDPLite: bad csum coverage %d/%d\n", cscov, skb->len); return 1; } else if (cscov < skb->len) { UDP_SKB_CB(skb)->partial_cov = 1; UDP_SKB_CB(skb)->cscov = cscov; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; skb->csum_valid = 0; } return 0; } /* Fast-path computation of checksum. Socket may not be locked. */ static inline __wsum udplite_csum(struct sk_buff *skb) { const int off = skb_transport_offset(skb); const struct sock *sk = skb->sk; int len = skb->len - off; if (udp_test_bit(UDPLITE_SEND_CC, sk)) { u16 pcslen = READ_ONCE(udp_sk(sk)->pcslen); if (pcslen < len) { if (pcslen > 0) len = pcslen; udp_hdr(skb)->len = htons(pcslen); } } skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ return skb_checksum(skb, off, len, 0); } void udplite4_register(void); #endif /* _UDPLITE_H */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 // SPDX-License-Identifier: GPL-2.0+ /* * Driver for Alauda-based card readers * * Current development and maintenance by: * (c) 2005 Daniel Drake <dsd@gentoo.org> * * The 'Alauda' is a chip manufacturered by RATOC for OEM use. * * Alauda implements a vendor-specific command set to access two media reader * ports (XD, SmartMedia). This driver converts SCSI commands to the commands * which are accepted by these devices. * * The driver was developed through reverse-engineering, with the help of the * sddr09 driver which has many similarities, and with some help from the * (very old) vendor-supplied GPL sma03 driver. * * For protocol info, see http://alauda.sourceforge.net */ #include <linux/module.h> #include <linux/slab.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> #include "usb.h" #include "transport.h" #include "protocol.h" #include "debug.h" #include "scsiglue.h" #define DRV_NAME "ums-alauda" MODULE_DESCRIPTION("Driver for Alauda-based card readers"); MODULE_AUTHOR("Daniel Drake <dsd@gentoo.org>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS(USB_STORAGE); /* * Status bytes */ #define ALAUDA_STATUS_ERROR 0x01 #define ALAUDA_STATUS_READY 0x40 /* * Control opcodes (for request field) */ #define ALAUDA_GET_XD_MEDIA_STATUS 0x08 #define ALAUDA_GET_SM_MEDIA_STATUS 0x98 #define ALAUDA_ACK_XD_MEDIA_CHANGE 0x0a #define ALAUDA_ACK_SM_MEDIA_CHANGE 0x9a #define ALAUDA_GET_XD_MEDIA_SIG 0x86 #define ALAUDA_GET_SM_MEDIA_SIG 0x96 /* * Bulk command identity (byte 0) */ #define ALAUDA_BULK_CMD 0x40 /* * Bulk opcodes (byte 1) */ #define ALAUDA_BULK_GET_REDU_DATA 0x85 #define ALAUDA_BULK_READ_BLOCK 0x94 #define ALAUDA_BULK_ERASE_BLOCK 0xa3 #define ALAUDA_BULK_WRITE_BLOCK 0xb4 #define ALAUDA_BULK_GET_STATUS2 0xb7 #define ALAUDA_BULK_RESET_MEDIA 0xe0 /* * Port to operate on (byte 8) */ #define ALAUDA_PORT_XD 0x00 #define ALAUDA_PORT_SM 0x01 /* * LBA and PBA are unsigned ints. Special values. */ #define UNDEF 0xffff #define SPARE 0xfffe #define UNUSABLE 0xfffd struct alauda_media_info { unsigned long capacity; /* total media size in bytes */ unsigned int pagesize; /* page size in bytes */ unsigned int blocksize; /* number of pages per block */ unsigned int uzonesize; /* number of usable blocks per zone */ unsigned int zonesize; /* number of blocks per zone */ unsigned int blockmask; /* mask to get page from address */ unsigned char pageshift; unsigned char blockshift; unsigned char zoneshift; u16 **lba_to_pba; /* logical to physical block map */ u16 **pba_to_lba; /* physical to logical block map */ }; struct alauda_info { struct alauda_media_info port[2]; int wr_ep; /* endpoint to write data out of */ unsigned char sense_key; unsigned long sense_asc; /* additional sense code */ unsigned long sense_ascq; /* additional sense code qualifier */ }; #define short_pack(lsb,msb) ( ((u16)(lsb)) | ( ((u16)(msb))<<8 ) ) #define LSB_of(s) ((s)&0xFF) #define MSB_of(s) ((s)>>8) #define MEDIA_PORT(us) us->srb->device->lun #define MEDIA_INFO(us) ((struct alauda_info *)us->extra)->port[MEDIA_PORT(us)] #define PBA_LO(pba) ((pba & 0xF) << 5) #define PBA_HI(pba) (pba >> 3) #define PBA_ZONE(pba) (pba >> 11) static int init_alauda(struct us_data *us); /* * The table of devices */ #define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ vendorName, productName, useProtocol, useTransport, \ initFunction, flags) \ { USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \ .driver_info = (flags) } static struct usb_device_id alauda_usb_ids[] = { # include "unusual_alauda.h" { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, alauda_usb_ids); #undef UNUSUAL_DEV /* * The flags table */ #define UNUSUAL_DEV(idVendor, idProduct, bcdDeviceMin, bcdDeviceMax, \ vendor_name, product_name, use_protocol, use_transport, \ init_function, Flags) \ { \ .vendorName = vendor_name, \ .productName = product_name, \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ .initFunction = init_function, \ } static struct us_unusual_dev alauda_unusual_dev_list[] = { # include "unusual_alauda.h" { } /* Terminating entry */ }; #undef UNUSUAL_DEV /* * Media handling */ struct alauda_card_info { unsigned char id; /* id byte */ unsigned char chipshift; /* 1<<cs bytes total capacity */ unsigned char pageshift; /* 1<<ps bytes in a page */ unsigned char blockshift; /* 1<<bs pages per block */ unsigned char zoneshift; /* 1<<zs blocks per zone */ }; static struct alauda_card_info alauda_card_ids[] = { /* NAND flash */ { 0x6e, 20, 8, 4, 8}, /* 1 MB */ { 0xe8, 20, 8, 4, 8}, /* 1 MB */ { 0xec, 20, 8, 4, 8}, /* 1 MB */ { 0x64, 21, 8, 4, 9}, /* 2 MB */ { 0xea, 21, 8, 4, 9}, /* 2 MB */ { 0x6b, 22, 9, 4, 9}, /* 4 MB */ { 0xe3, 22, 9, 4, 9}, /* 4 MB */ { 0xe5, 22, 9, 4, 9}, /* 4 MB */ { 0xe6, 23, 9, 4, 10}, /* 8 MB */ { 0x73, 24, 9, 5, 10}, /* 16 MB */ { 0x75, 25, 9, 5, 10}, /* 32 MB */ { 0x76, 26, 9, 5, 10}, /* 64 MB */ { 0x79, 27, 9, 5, 10}, /* 128 MB */ { 0x71, 28, 9, 5, 10}, /* 256 MB */ /* MASK ROM */ { 0x5d, 21, 9, 4, 8}, /* 2 MB */ { 0xd5, 22, 9, 4, 9}, /* 4 MB */ { 0xd6, 23, 9, 4, 10}, /* 8 MB */ { 0x57, 24, 9, 4, 11}, /* 16 MB */ { 0x58, 25, 9, 4, 12}, /* 32 MB */ { 0,} }; static struct alauda_card_info *alauda_card_find_id(unsigned char id) { int i; for (i = 0; alauda_card_ids[i].id != 0; i++) if (alauda_card_ids[i].id == id) return &(alauda_card_ids[i]); return NULL; } /* * ECC computation. */ static unsigned char parity[256]; static unsigned char ecc2[256]; static void nand_init_ecc(void) { int i, j, a; parity[0] = 0; for (i = 1; i < 256; i++) parity[i] = (parity[i&(i-1)] ^ 1); for (i = 0; i < 256; i++) { a = 0; for (j = 0; j < 8; j++) { if (i & (1<<j)) { if ((j & 1) == 0) a ^= 0x04; if ((j & 2) == 0) a ^= 0x10; if ((j & 4) == 0) a ^= 0x40; } } ecc2[i] = ~(a ^ (a<<1) ^ (parity[i] ? 0xa8 : 0)); } } /* compute 3-byte ecc on 256 bytes */ static void nand_compute_ecc(unsigned char *data, unsigned char *ecc) { int i, j, a; unsigned char par = 0, bit, bits[8] = {0}; /* collect 16 checksum bits */ for (i = 0; i < 256; i++) { par ^= data[i]; bit = parity[data[i]]; for (j = 0; j < 8; j++) if ((i & (1<<j)) == 0) bits[j] ^= bit; } /* put 4+4+4 = 12 bits in the ecc */ a = (bits[3] << 6) + (bits[2] << 4) + (bits[1] << 2) + bits[0]; ecc[0] = ~(a ^ (a<<1) ^ (parity[par] ? 0xaa : 0)); a = (bits[7] << 6) + (bits[6] << 4) + (bits[5] << 2) + bits[4]; ecc[1] = ~(a ^ (a<<1) ^ (parity[par] ? 0xaa : 0)); ecc[2] = ecc2[par]; } static int nand_compare_ecc(unsigned char *data, unsigned char *ecc) { return (data[0] == ecc[0] && data[1] == ecc[1] && data[2] == ecc[2]); } static void nand_store_ecc(unsigned char *data, unsigned char *ecc) { memcpy(data, ecc, 3); } /* * Alauda driver */ /* * Forget our PBA <---> LBA mappings for a particular port */ static void alauda_free_maps (struct alauda_media_info *media_info) { unsigned int shift = media_info->zoneshift + media_info->blockshift + media_info->pageshift; unsigned int num_zones = media_info->capacity >> shift; unsigned int i; if (media_info->lba_to_pba != NULL) for (i = 0; i < num_zones; i++) { kfree(media_info->lba_to_pba[i]); media_info->lba_to_pba[i] = NULL; } if (media_info->pba_to_lba != NULL) for (i = 0; i < num_zones; i++) { kfree(media_info->pba_to_lba[i]); media_info->pba_to_lba[i] = NULL; } } /* * Returns 2 bytes of status data * The first byte describes media status, and second byte describes door status */ static int alauda_get_media_status(struct us_data *us, unsigned char *data) { int rc; unsigned char command; if (MEDIA_PORT(us) == ALAUDA_PORT_XD) command = ALAUDA_GET_XD_MEDIA_STATUS; else command = ALAUDA_GET_SM_MEDIA_STATUS; rc = usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe, command, 0xc0, 0, 1, data, 2); if (rc == USB_STOR_XFER_GOOD) usb_stor_dbg(us, "Media status %02X %02X\n", data[0], data[1]); return rc; } /* * Clears the "media was changed" bit so that we know when it changes again * in the future. */ static int alauda_ack_media(struct us_data *us) { unsigned char command; if (MEDIA_PORT(us) == ALAUDA_PORT_XD) command = ALAUDA_ACK_XD_MEDIA_CHANGE; else command = ALAUDA_ACK_SM_MEDIA_CHANGE; return usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, command, 0x40, 0, 1, NULL, 0); } /* * Retrieves a 4-byte media signature, which indicates manufacturer, capacity, * and some other details. */ static int alauda_get_media_signature(struct us_data *us, unsigned char *data) { unsigned char command; if (MEDIA_PORT(us) == ALAUDA_PORT_XD) command = ALAUDA_GET_XD_MEDIA_SIG; else command = ALAUDA_GET_SM_MEDIA_SIG; return usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe, command, 0xc0, 0, 0, data, 4); } /* * Resets the media status (but not the whole device?) */ static int alauda_reset_media(struct us_data *us) { unsigned char *command = us->iobuf; memset(command, 0, 9); command[0] = ALAUDA_BULK_CMD; command[1] = ALAUDA_BULK_RESET_MEDIA; command[8] = MEDIA_PORT(us); return usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); } /* * Examines the media and deduces capacity, etc. */ static int alauda_init_media(struct us_data *us) { unsigned char *data = us->iobuf; int ready = 0; struct alauda_card_info *media_info; unsigned int num_zones; while (ready == 0) { msleep(20); if (alauda_get_media_status(us, data) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (data[0] & 0x10) ready = 1; } usb_stor_dbg(us, "We are ready for action!\n"); if (alauda_ack_media(us) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; msleep(10); if (alauda_get_media_status(us, data) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (data[0] != 0x14) { usb_stor_dbg(us, "Media not ready after ack\n"); return USB_STOR_TRANSPORT_ERROR; } if (alauda_get_media_signature(us, data) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; usb_stor_dbg(us, "Media signature: %4ph\n", data); media_info = alauda_card_find_id(data[1]); if (media_info == NULL) { pr_warn("alauda_init_media: Unrecognised media signature: %4ph\n", data); return USB_STOR_TRANSPORT_ERROR; } MEDIA_INFO(us).capacity = 1 << media_info->chipshift; usb_stor_dbg(us, "Found media with capacity: %ldMB\n", MEDIA_INFO(us).capacity >> 20); MEDIA_INFO(us).pageshift = media_info->pageshift; MEDIA_INFO(us).blockshift = media_info->blockshift; MEDIA_INFO(us).zoneshift = media_info->zoneshift; MEDIA_INFO(us).pagesize = 1 << media_info->pageshift; MEDIA_INFO(us).blocksize = 1 << media_info->blockshift; MEDIA_INFO(us).zonesize = 1 << media_info->zoneshift; MEDIA_INFO(us).uzonesize = ((1 << media_info->zoneshift) / 128) * 125; MEDIA_INFO(us).blockmask = MEDIA_INFO(us).blocksize - 1; num_zones = MEDIA_INFO(us).capacity >> (MEDIA_INFO(us).zoneshift + MEDIA_INFO(us).blockshift + MEDIA_INFO(us).pageshift); MEDIA_INFO(us).pba_to_lba = kcalloc(num_zones, sizeof(u16*), GFP_NOIO); MEDIA_INFO(us).lba_to_pba = kcalloc(num_zones, sizeof(u16*), GFP_NOIO); if (MEDIA_INFO(us).pba_to_lba == NULL || MEDIA_INFO(us).lba_to_pba == NULL) return USB_STOR_TRANSPORT_ERROR; if (alauda_reset_media(us) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; return USB_STOR_TRANSPORT_GOOD; } /* * Examines the media status and does the right thing when the media has gone, * appeared, or changed. */ static int alauda_check_media(struct us_data *us) { struct alauda_info *info = (struct alauda_info *) us->extra; unsigned char *status = us->iobuf; int rc; rc = alauda_get_media_status(us, status); if (rc != USB_STOR_XFER_GOOD) { status[0] = 0xF0; /* Pretend there's no media */ status[1] = 0; } /* Check for no media or door open */ if ((status[0] & 0x80) || ((status[0] & 0x1F) == 0x10) || ((status[1] & 0x01) == 0)) { usb_stor_dbg(us, "No media, or door open\n"); alauda_free_maps(&MEDIA_INFO(us)); info->sense_key = 0x02; info->sense_asc = 0x3A; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } /* Check for media change */ if (status[0] & 0x08) { usb_stor_dbg(us, "Media change detected\n"); alauda_free_maps(&MEDIA_INFO(us)); alauda_init_media(us); info->sense_key = UNIT_ATTENTION; info->sense_asc = 0x28; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } return USB_STOR_TRANSPORT_GOOD; } /* * Checks the status from the 2nd status register * Returns 3 bytes of status data, only the first is known */ static int alauda_check_status2(struct us_data *us) { int rc; unsigned char command[] = { ALAUDA_BULK_CMD, ALAUDA_BULK_GET_STATUS2, 0, 0, 0, 0, 3, 0, MEDIA_PORT(us) }; unsigned char data[3]; rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; rc = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, data, 3, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; usb_stor_dbg(us, "%3ph\n", data); if (data[0] & ALAUDA_STATUS_ERROR) return USB_STOR_XFER_ERROR; return USB_STOR_XFER_GOOD; } /* * Gets the redundancy data for the first page of a PBA * Returns 16 bytes. */ static int alauda_get_redu_data(struct us_data *us, u16 pba, unsigned char *data) { int rc; unsigned char command[] = { ALAUDA_BULK_CMD, ALAUDA_BULK_GET_REDU_DATA, PBA_HI(pba), PBA_ZONE(pba), 0, PBA_LO(pba), 0, 0, MEDIA_PORT(us) }; rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; return usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, data, 16, NULL); } /* * Finds the first unused PBA in a zone * Returns the absolute PBA of an unused PBA, or 0 if none found. */ static u16 alauda_find_unused_pba(struct alauda_media_info *info, unsigned int zone) { u16 *pba_to_lba = info->pba_to_lba[zone]; unsigned int i; for (i = 0; i < info->zonesize; i++) if (pba_to_lba[i] == UNDEF) return (zone << info->zoneshift) + i; return 0; } /* * Reads the redundancy data for all PBA's in a zone * Produces lba <--> pba mappings */ static int alauda_read_map(struct us_data *us, unsigned int zone) { unsigned char *data = us->iobuf; int result; int i, j; unsigned int zonesize = MEDIA_INFO(us).zonesize; unsigned int uzonesize = MEDIA_INFO(us).uzonesize; unsigned int lba_offset, lba_real, blocknum; unsigned int zone_base_lba = zone * uzonesize; unsigned int zone_base_pba = zone * zonesize; u16 *lba_to_pba = kcalloc(zonesize, sizeof(u16), GFP_NOIO); u16 *pba_to_lba = kcalloc(zonesize, sizeof(u16), GFP_NOIO); if (lba_to_pba == NULL || pba_to_lba == NULL) { result = USB_STOR_TRANSPORT_ERROR; goto error; } usb_stor_dbg(us, "Mapping blocks for zone %d\n", zone); /* 1024 PBA's per zone */ for (i = 0; i < zonesize; i++) lba_to_pba[i] = pba_to_lba[i] = UNDEF; for (i = 0; i < zonesize; i++) { blocknum = zone_base_pba + i; result = alauda_get_redu_data(us, blocknum, data); if (result != USB_STOR_XFER_GOOD) { result = USB_STOR_TRANSPORT_ERROR; goto error; } /* special PBAs have control field 0^16 */ for (j = 0; j < 16; j++) if (data[j] != 0) goto nonz; pba_to_lba[i] = UNUSABLE; usb_stor_dbg(us, "PBA %d has no logical mapping\n", blocknum); continue; nonz: /* unwritten PBAs have control field FF^16 */ for (j = 0; j < 16; j++) if (data[j] != 0xff) goto nonff; continue; nonff: /* normal PBAs start with six FFs */ if (j < 6) { usb_stor_dbg(us, "PBA %d has no logical mapping: reserved area = %02X%02X%02X%02X data status %02X block status %02X\n", blocknum, data[0], data[1], data[2], data[3], data[4], data[5]); pba_to_lba[i] = UNUSABLE; continue; } if ((data[6] >> 4) != 0x01) { usb_stor_dbg(us, "PBA %d has invalid address field %02X%02X/%02X%02X\n", blocknum, data[6], data[7], data[11], data[12]); pba_to_lba[i] = UNUSABLE; continue; } /* check even parity */ if (parity[data[6] ^ data[7]]) { printk(KERN_WARNING "alauda_read_map: Bad parity in LBA for block %d" " (%02X %02X)\n", i, data[6], data[7]); pba_to_lba[i] = UNUSABLE; continue; } lba_offset = short_pack(data[7], data[6]); lba_offset = (lba_offset & 0x07FF) >> 1; lba_real = lba_offset + zone_base_lba; /* * Every 1024 physical blocks ("zone"), the LBA numbers * go back to zero, but are within a higher block of LBA's. * Also, there is a maximum of 1000 LBA's per zone. * In other words, in PBA 1024-2047 you will find LBA 0-999 * which are really LBA 1000-1999. This allows for 24 bad * or special physical blocks per zone. */ if (lba_offset >= uzonesize) { printk(KERN_WARNING "alauda_read_map: Bad low LBA %d for block %d\n", lba_real, blocknum); continue; } if (lba_to_pba[lba_offset] != UNDEF) { printk(KERN_WARNING "alauda_read_map: " "LBA %d seen for PBA %d and %d\n", lba_real, lba_to_pba[lba_offset], blocknum); continue; } pba_to_lba[i] = lba_real; lba_to_pba[lba_offset] = blocknum; continue; } MEDIA_INFO(us).lba_to_pba[zone] = lba_to_pba; MEDIA_INFO(us).pba_to_lba[zone] = pba_to_lba; result = 0; goto out; error: kfree(lba_to_pba); kfree(pba_to_lba); out: return result; } /* * Checks to see whether we have already mapped a certain zone * If we haven't, the map is generated */ static void alauda_ensure_map_for_zone(struct us_data *us, unsigned int zone) { if (MEDIA_INFO(us).lba_to_pba[zone] == NULL || MEDIA_INFO(us).pba_to_lba[zone] == NULL) alauda_read_map(us, zone); } /* * Erases an entire block */ static int alauda_erase_block(struct us_data *us, u16 pba) { int rc; unsigned char command[] = { ALAUDA_BULK_CMD, ALAUDA_BULK_ERASE_BLOCK, PBA_HI(pba), PBA_ZONE(pba), 0, PBA_LO(pba), 0x02, 0, MEDIA_PORT(us) }; unsigned char buf[2]; usb_stor_dbg(us, "Erasing PBA %d\n", pba); rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; rc = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, buf, 2, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; usb_stor_dbg(us, "Erase result: %02X %02X\n", buf[0], buf[1]); return rc; } /* * Reads data from a certain offset page inside a PBA, including interleaved * redundancy data. Returns (pagesize+64)*pages bytes in data. */ static int alauda_read_block_raw(struct us_data *us, u16 pba, unsigned int page, unsigned int pages, unsigned char *data) { int rc; unsigned char command[] = { ALAUDA_BULK_CMD, ALAUDA_BULK_READ_BLOCK, PBA_HI(pba), PBA_ZONE(pba), 0, PBA_LO(pba) + page, pages, 0, MEDIA_PORT(us) }; usb_stor_dbg(us, "pba %d page %d count %d\n", pba, page, pages); rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; return usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, data, (MEDIA_INFO(us).pagesize + 64) * pages, NULL); } /* * Reads data from a certain offset page inside a PBA, excluding redundancy * data. Returns pagesize*pages bytes in data. Note that data must be big enough * to hold (pagesize+64)*pages bytes of data, but you can ignore those 'extra' * trailing bytes outside this function. */ static int alauda_read_block(struct us_data *us, u16 pba, unsigned int page, unsigned int pages, unsigned char *data) { int i, rc; unsigned int pagesize = MEDIA_INFO(us).pagesize; rc = alauda_read_block_raw(us, pba, page, pages, data); if (rc != USB_STOR_XFER_GOOD) return rc; /* Cut out the redundancy data */ for (i = 0; i < pages; i++) { int dest_offset = i * pagesize; int src_offset = i * (pagesize + 64); memmove(data + dest_offset, data + src_offset, pagesize); } return rc; } /* * Writes an entire block of data and checks status after write. * Redundancy data must be already included in data. Data should be * (pagesize+64)*blocksize bytes in length. */ static int alauda_write_block(struct us_data *us, u16 pba, unsigned char *data) { int rc; struct alauda_info *info = (struct alauda_info *) us->extra; unsigned char command[] = { ALAUDA_BULK_CMD, ALAUDA_BULK_WRITE_BLOCK, PBA_HI(pba), PBA_ZONE(pba), 0, PBA_LO(pba), 32, 0, MEDIA_PORT(us) }; usb_stor_dbg(us, "pba %d\n", pba); rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; rc = usb_stor_bulk_transfer_buf(us, info->wr_ep, data, (MEDIA_INFO(us).pagesize + 64) * MEDIA_INFO(us).blocksize, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; return alauda_check_status2(us); } /* * Write some data to a specific LBA. */ static int alauda_write_lba(struct us_data *us, u16 lba, unsigned int page, unsigned int pages, unsigned char *ptr, unsigned char *blockbuffer) { u16 pba, lbap, new_pba; unsigned char *bptr, *cptr, *xptr; unsigned char ecc[3]; int i, result; unsigned int uzonesize = MEDIA_INFO(us).uzonesize; unsigned int zonesize = MEDIA_INFO(us).zonesize; unsigned int pagesize = MEDIA_INFO(us).pagesize; unsigned int blocksize = MEDIA_INFO(us).blocksize; unsigned int lba_offset = lba % uzonesize; unsigned int new_pba_offset; unsigned int zone = lba / uzonesize; alauda_ensure_map_for_zone(us, zone); pba = MEDIA_INFO(us).lba_to_pba[zone][lba_offset]; if (pba == 1) { /* * Maybe it is impossible to write to PBA 1. * Fake success, but don't do anything. */ printk(KERN_WARNING "alauda_write_lba: avoid writing to pba 1\n"); return USB_STOR_TRANSPORT_GOOD; } new_pba = alauda_find_unused_pba(&MEDIA_INFO(us), zone); if (!new_pba) { printk(KERN_WARNING "alauda_write_lba: Out of unused blocks\n"); return USB_STOR_TRANSPORT_ERROR; } /* read old contents */ if (pba != UNDEF) { result = alauda_read_block_raw(us, pba, 0, blocksize, blockbuffer); if (result != USB_STOR_XFER_GOOD) return result; } else { memset(blockbuffer, 0, blocksize * (pagesize + 64)); } lbap = (lba_offset << 1) | 0x1000; if (parity[MSB_of(lbap) ^ LSB_of(lbap)]) lbap ^= 1; /* check old contents and fill lba */ for (i = 0; i < blocksize; i++) { bptr = blockbuffer + (i * (pagesize + 64)); cptr = bptr + pagesize; nand_compute_ecc(bptr, ecc); if (!nand_compare_ecc(cptr+13, ecc)) { usb_stor_dbg(us, "Warning: bad ecc in page %d- of pba %d\n", i, pba); nand_store_ecc(cptr+13, ecc); } nand_compute_ecc(bptr + (pagesize / 2), ecc); if (!nand_compare_ecc(cptr+8, ecc)) { usb_stor_dbg(us, "Warning: bad ecc in page %d+ of pba %d\n", i, pba); nand_store_ecc(cptr+8, ecc); } cptr[6] = cptr[11] = MSB_of(lbap); cptr[7] = cptr[12] = LSB_of(lbap); } /* copy in new stuff and compute ECC */ xptr = ptr; for (i = page; i < page+pages; i++) { bptr = blockbuffer + (i * (pagesize + 64)); cptr = bptr + pagesize; memcpy(bptr, xptr, pagesize); xptr += pagesize; nand_compute_ecc(bptr, ecc); nand_store_ecc(cptr+13, ecc); nand_compute_ecc(bptr + (pagesize / 2), ecc); nand_store_ecc(cptr+8, ecc); } result = alauda_write_block(us, new_pba, blockbuffer); if (result != USB_STOR_XFER_GOOD) return result; new_pba_offset = new_pba - (zone * zonesize); MEDIA_INFO(us).pba_to_lba[zone][new_pba_offset] = lba; MEDIA_INFO(us).lba_to_pba[zone][lba_offset] = new_pba; usb_stor_dbg(us, "Remapped LBA %d to PBA %d\n", lba, new_pba); if (pba != UNDEF) { unsigned int pba_offset = pba - (zone * zonesize); result = alauda_erase_block(us, pba); if (result != USB_STOR_XFER_GOOD) return result; MEDIA_INFO(us).pba_to_lba[zone][pba_offset] = UNDEF; } return USB_STOR_TRANSPORT_GOOD; } /* * Read data from a specific sector address */ static int alauda_read_data(struct us_data *us, unsigned long address, unsigned int sectors) { unsigned char *buffer; u16 lba, max_lba; unsigned int page, len, offset; unsigned int blockshift = MEDIA_INFO(us).blockshift; unsigned int pageshift = MEDIA_INFO(us).pageshift; unsigned int blocksize = MEDIA_INFO(us).blocksize; unsigned int pagesize = MEDIA_INFO(us).pagesize; unsigned int uzonesize = MEDIA_INFO(us).uzonesize; struct scatterlist *sg; int result; /* * Since we only read in one block at a time, we have to create * a bounce buffer and move the data a piece at a time between the * bounce buffer and the actual transfer buffer. * We make this buffer big enough to hold temporary redundancy data, * which we use when reading the data blocks. */ len = min(sectors, blocksize) * (pagesize + 64); buffer = kmalloc(len, GFP_NOIO); if (!buffer) return USB_STOR_TRANSPORT_ERROR; /* Figure out the initial LBA and page */ lba = address >> blockshift; page = (address & MEDIA_INFO(us).blockmask); max_lba = MEDIA_INFO(us).capacity >> (blockshift + pageshift); result = USB_STOR_TRANSPORT_GOOD; offset = 0; sg = NULL; while (sectors > 0) { unsigned int zone = lba / uzonesize; /* integer division */ unsigned int lba_offset = lba - (zone * uzonesize); unsigned int pages; u16 pba; alauda_ensure_map_for_zone(us, zone); /* Not overflowing capacity? */ if (lba >= max_lba) { usb_stor_dbg(us, "Error: Requested lba %u exceeds maximum %u\n", lba, max_lba); result = USB_STOR_TRANSPORT_ERROR; break; } /* Find number of pages we can read in this block */ pages = min(sectors, blocksize - page); len = pages << pageshift; /* Find where this lba lives on disk */ pba = MEDIA_INFO(us).lba_to_pba[zone][lba_offset]; if (pba == UNDEF) { /* this lba was never written */ usb_stor_dbg(us, "Read %d zero pages (LBA %d) page %d\n", pages, lba, page); /* * This is not really an error. It just means * that the block has never been written. * Instead of returning USB_STOR_TRANSPORT_ERROR * it is better to return all zero data. */ memset(buffer, 0, len); } else { usb_stor_dbg(us, "Read %d pages, from PBA %d (LBA %d) page %d\n", pages, pba, lba, page); result = alauda_read_block(us, pba, page, pages, buffer); if (result != USB_STOR_TRANSPORT_GOOD) break; } /* Store the data in the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &offset, TO_XFER_BUF); page = 0; lba++; sectors -= pages; } kfree(buffer); return result; } /* * Write data to a specific sector address */ static int alauda_write_data(struct us_data *us, unsigned long address, unsigned int sectors) { unsigned char *buffer, *blockbuffer; unsigned int page, len, offset; unsigned int blockshift = MEDIA_INFO(us).blockshift; unsigned int pageshift = MEDIA_INFO(us).pageshift; unsigned int blocksize = MEDIA_INFO(us).blocksize; unsigned int pagesize = MEDIA_INFO(us).pagesize; struct scatterlist *sg; u16 lba, max_lba; int result; /* * Since we don't write the user data directly to the device, * we have to create a bounce buffer and move the data a piece * at a time between the bounce buffer and the actual transfer buffer. */ len = min(sectors, blocksize) * pagesize; buffer = kmalloc(len, GFP_NOIO); if (!buffer) return USB_STOR_TRANSPORT_ERROR; /* * We also need a temporary block buffer, where we read in the old data, * overwrite parts with the new data, and manipulate the redundancy data */ blockbuffer = kmalloc_array(pagesize + 64, blocksize, GFP_NOIO); if (!blockbuffer) { kfree(buffer); return USB_STOR_TRANSPORT_ERROR; } /* Figure out the initial LBA and page */ lba = address >> blockshift; page = (address & MEDIA_INFO(us).blockmask); max_lba = MEDIA_INFO(us).capacity >> (pageshift + blockshift); result = USB_STOR_TRANSPORT_GOOD; offset = 0; sg = NULL; while (sectors > 0) { /* Write as many sectors as possible in this block */ unsigned int pages = min(sectors, blocksize - page); len = pages << pageshift; /* Not overflowing capacity? */ if (lba >= max_lba) { usb_stor_dbg(us, "Requested lba %u exceeds maximum %u\n", lba, max_lba); result = USB_STOR_TRANSPORT_ERROR; break; } /* Get the data from the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &offset, FROM_XFER_BUF); result = alauda_write_lba(us, lba, page, pages, buffer, blockbuffer); if (result != USB_STOR_TRANSPORT_GOOD) break; page = 0; lba++; sectors -= pages; } kfree(buffer); kfree(blockbuffer); return result; } /* * Our interface with the rest of the world */ static void alauda_info_destructor(void *extra) { struct alauda_info *info = (struct alauda_info *) extra; int port; if (!info) return; for (port = 0; port < 2; port++) { struct alauda_media_info *media_info = &info->port[port]; alauda_free_maps(media_info); kfree(media_info->lba_to_pba); kfree(media_info->pba_to_lba); } } /* * Initialize alauda_info struct and find the data-write endpoint */ static int init_alauda(struct us_data *us) { struct alauda_info *info; struct usb_host_interface *altsetting = us->pusb_intf->cur_altsetting; nand_init_ecc(); us->extra = kzalloc(sizeof(struct alauda_info), GFP_NOIO); if (!us->extra) return -ENOMEM; info = (struct alauda_info *) us->extra; us->extra_destructor = alauda_info_destructor; info->wr_ep = usb_sndbulkpipe(us->pusb_dev, altsetting->endpoint[0].desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); return 0; } static int alauda_transport(struct scsi_cmnd *srb, struct us_data *us) { int rc; struct alauda_info *info = (struct alauda_info *) us->extra; unsigned char *ptr = us->iobuf; static unsigned char inquiry_response[36] = { 0x00, 0x80, 0x00, 0x01, 0x1F, 0x00, 0x00, 0x00 }; if (srb->cmnd[0] == INQUIRY) { usb_stor_dbg(us, "INQUIRY - Returning bogus response\n"); memcpy(ptr, inquiry_response, sizeof(inquiry_response)); fill_inquiry_response(us, ptr, 36); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == TEST_UNIT_READY) { usb_stor_dbg(us, "TEST_UNIT_READY\n"); return alauda_check_media(us); } if (srb->cmnd[0] == READ_CAPACITY) { unsigned int num_zones; unsigned long capacity; rc = alauda_check_media(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; num_zones = MEDIA_INFO(us).capacity >> (MEDIA_INFO(us).zoneshift + MEDIA_INFO(us).blockshift + MEDIA_INFO(us).pageshift); capacity = num_zones * MEDIA_INFO(us).uzonesize * MEDIA_INFO(us).blocksize; /* Report capacity and page size */ ((__be32 *) ptr)[0] = cpu_to_be32(capacity - 1); ((__be32 *) ptr)[1] = cpu_to_be32(512); usb_stor_set_xfer_buf(ptr, 8, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == READ_10) { unsigned int page, pages; rc = alauda_check_media(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; page = short_pack(srb->cmnd[3], srb->cmnd[2]); page <<= 16; page |= short_pack(srb->cmnd[5], srb->cmnd[4]); pages = short_pack(srb->cmnd[8], srb->cmnd[7]); usb_stor_dbg(us, "READ_10: page %d pagect %d\n", page, pages); return alauda_read_data(us, page, pages); } if (srb->cmnd[0] == WRITE_10) { unsigned int page, pages; rc = alauda_check_media(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; page = short_pack(srb->cmnd[3], srb->cmnd[2]); page <<= 16; page |= short_pack(srb->cmnd[5], srb->cmnd[4]); pages = short_pack(srb->cmnd[8], srb->cmnd[7]); usb_stor_dbg(us, "WRITE_10: page %d pagect %d\n", page, pages); return alauda_write_data(us, page, pages); } if (srb->cmnd[0] == REQUEST_SENSE) { usb_stor_dbg(us, "REQUEST_SENSE\n"); memset(ptr, 0, 18); ptr[0] = 0xF0; ptr[2] = info->sense_key; ptr[7] = 11; ptr[12] = info->sense_asc; ptr[13] = info->sense_ascq; usb_stor_set_xfer_buf(ptr, 18, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == ALLOW_MEDIUM_REMOVAL) { /* * sure. whatever. not like we can stop the user from popping * the media out of the device (no locking doors, etc) */ return USB_STOR_TRANSPORT_GOOD; } usb_stor_dbg(us, "Gah! Unknown command: %d (0x%x)\n", srb->cmnd[0], srb->cmnd[0]); info->sense_key = 0x05; info->sense_asc = 0x20; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } static struct scsi_host_template alauda_host_template; static int alauda_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct us_data *us; int result; result = usb_stor_probe1(&us, intf, id, (id - alauda_usb_ids) + alauda_unusual_dev_list, &alauda_host_template); if (result) return result; us->transport_name = "Alauda Control/Bulk"; us->transport = alauda_transport; us->transport_reset = usb_stor_Bulk_reset; us->max_lun = 1; result = usb_stor_probe2(us); return result; } static struct usb_driver alauda_driver = { .name = DRV_NAME, .probe = alauda_probe, .disconnect = usb_stor_disconnect, .suspend = usb_stor_suspend, .resume = usb_stor_resume, .reset_resume = usb_stor_reset_resume, .pre_reset = usb_stor_pre_reset, .post_reset = usb_stor_post_reset, .id_table = alauda_usb_ids, .soft_unbind = 1, .no_dynamic_id = 1, }; module_usb_stor_driver(alauda_driver, alauda_host_template, DRV_NAME);
207 207 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 // SPDX-License-Identifier: GPL-2.0-only /* * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * This is the HCI over NCI implementation, as specified in the 10.2 * section of the NCI 1.1 specification. * * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. */ #include <linux/skbuff.h> #include "../nfc.h" #include <net/nfc/nci.h> #include <net/nfc/nci_core.h> #include <linux/nfc.h> #include <linux/kcov.h> struct nci_data { u8 conn_id; u8 pipe; u8 cmd; const u8 *data; u32 data_len; } __packed; struct nci_hci_create_pipe_params { u8 src_gate; u8 dest_host; u8 dest_gate; } __packed; struct nci_hci_create_pipe_resp { u8 src_host; u8 src_gate; u8 dest_host; u8 dest_gate; u8 pipe; } __packed; struct nci_hci_delete_pipe_noti { u8 pipe; } __packed; struct nci_hci_all_pipe_cleared_noti { u8 host; } __packed; struct nci_hcp_message { u8 header; /* type -cmd,evt,rsp- + instruction */ u8 data[]; } __packed; struct nci_hcp_packet { u8 header; /* cbit+pipe */ struct nci_hcp_message message; } __packed; #define NCI_HCI_ANY_SET_PARAMETER 0x01 #define NCI_HCI_ANY_GET_PARAMETER 0x02 #define NCI_HCI_ANY_CLOSE_PIPE 0x04 #define NCI_HCI_ADM_CLEAR_ALL_PIPE 0x14 #define NCI_HFP_NO_CHAINING 0x80 #define NCI_NFCEE_ID_HCI 0x80 #define NCI_EVT_HOT_PLUG 0x03 #define NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY 0x01 #define NCI_HCI_ADM_CREATE_PIPE 0x10 #define NCI_HCI_ADM_DELETE_PIPE 0x11 /* HCP headers */ #define NCI_HCI_HCP_PACKET_HEADER_LEN 1 #define NCI_HCI_HCP_MESSAGE_HEADER_LEN 1 #define NCI_HCI_HCP_HEADER_LEN 2 /* HCP types */ #define NCI_HCI_HCP_COMMAND 0x00 #define NCI_HCI_HCP_EVENT 0x01 #define NCI_HCI_HCP_RESPONSE 0x02 #define NCI_HCI_ADM_NOTIFY_PIPE_CREATED 0x12 #define NCI_HCI_ADM_NOTIFY_PIPE_DELETED 0x13 #define NCI_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED 0x15 #define NCI_HCI_FRAGMENT 0x7f #define NCI_HCP_HEADER(type, instr) ((((type) & 0x03) << 6) |\ ((instr) & 0x3f)) #define NCI_HCP_MSG_GET_TYPE(header) ((header & 0xc0) >> 6) #define NCI_HCP_MSG_GET_CMD(header) (header & 0x3f) #define NCI_HCP_MSG_GET_PIPE(header) (header & 0x7f) static int nci_hci_result_to_errno(u8 result) { switch (result) { case NCI_HCI_ANY_OK: return 0; case NCI_HCI_ANY_E_REG_PAR_UNKNOWN: return -EOPNOTSUPP; case NCI_HCI_ANY_E_TIMEOUT: return -ETIME; default: return -1; } } /* HCI core */ static void nci_hci_reset_pipes(struct nci_hci_dev *hdev) { int i; for (i = 0; i < NCI_HCI_MAX_PIPES; i++) { hdev->pipes[i].gate = NCI_HCI_INVALID_GATE; hdev->pipes[i].host = NCI_HCI_INVALID_HOST; } memset(hdev->gate2pipe, NCI_HCI_INVALID_PIPE, sizeof(hdev->gate2pipe)); } static void nci_hci_reset_pipes_per_host(struct nci_dev *ndev, u8 host) { int i; for (i = 0; i < NCI_HCI_MAX_PIPES; i++) { if (ndev->hci_dev->pipes[i].host == host) { ndev->hci_dev->pipes[i].gate = NCI_HCI_INVALID_GATE; ndev->hci_dev->pipes[i].host = NCI_HCI_INVALID_HOST; } } } /* Fragment HCI data over NCI packet. * NFC Forum NCI 10.2.2 Data Exchange: * The payload of the Data Packets sent on the Logical Connection SHALL be * valid HCP packets, as defined within [ETSI_102622]. Each Data Packet SHALL * contain a single HCP packet. NCI Segmentation and Reassembly SHALL NOT be * applied to Data Messages in either direction. The HCI fragmentation mechanism * is used if required. */ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, const u8 data_type, const u8 *data, size_t data_len) { const struct nci_conn_info *conn_info; struct sk_buff *skb; int len, i, r; u8 cb = pipe; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; i = 0; skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; skb_reserve(skb, NCI_DATA_HDR_SIZE + 2); *(u8 *)skb_push(skb, 1) = data_type; do { /* If last packet add NCI_HFP_NO_CHAINING */ if (i + conn_info->max_pkt_payload_len - (skb->len + 1) >= data_len) { cb |= NCI_HFP_NO_CHAINING; len = data_len - i; } else { len = conn_info->max_pkt_payload_len - skb->len - 1; } *(u8 *)skb_push(skb, 1) = cb; if (len > 0) skb_put_data(skb, data + i, len); r = nci_send_data(ndev, conn_info->conn_id, skb); if (r < 0) return r; i += len; if (i < data_len) { skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; skb_reserve(skb, NCI_DATA_HDR_SIZE + 1); } } while (i < data_len); return i; } static void nci_hci_send_data_req(struct nci_dev *ndev, const void *opt) { const struct nci_data *data = opt; nci_hci_send_data(ndev, data->pipe, data->cmd, data->data, data->data_len); } int nci_hci_send_event(struct nci_dev *ndev, u8 gate, u8 event, const u8 *param, size_t param_len) { u8 pipe = ndev->hci_dev->gate2pipe[gate]; if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; return nci_hci_send_data(ndev, pipe, NCI_HCP_HEADER(NCI_HCI_HCP_EVENT, event), param, param_len); } EXPORT_SYMBOL(nci_hci_send_event); int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, const u8 *param, size_t param_len, struct sk_buff **skb) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, cmd); data.data = param; data.data_len = param_len; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); if (!r && skb) *skb = conn_info->rx_skb; } return r; } EXPORT_SYMBOL(nci_hci_send_cmd); int nci_hci_clear_all_pipes(struct nci_dev *ndev) { int r; r = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_CLEAR_ALL_PIPE, NULL, 0, NULL); if (r < 0) return r; nci_hci_reset_pipes(ndev->hci_dev); return r; } EXPORT_SYMBOL(nci_hci_clear_all_pipes); static void nci_hci_event_received(struct nci_dev *ndev, u8 pipe, u8 event, struct sk_buff *skb) { if (ndev->ops->hci_event_received) ndev->ops->hci_event_received(ndev, pipe, event, skb); } static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, u8 cmd, struct sk_buff *skb) { u8 gate = ndev->hci_dev->pipes[pipe].gate; u8 status = NCI_HCI_ANY_OK | ~NCI_HCI_FRAGMENT; u8 dest_gate, new_pipe; struct nci_hci_create_pipe_resp *create_info; struct nci_hci_delete_pipe_noti *delete_info; struct nci_hci_all_pipe_cleared_noti *cleared_info; pr_debug("from gate %x pipe %x cmd %x\n", gate, pipe, cmd); switch (cmd) { case NCI_HCI_ADM_NOTIFY_PIPE_CREATED: if (skb->len != 5) { status = NCI_HCI_ANY_E_NOK; goto exit; } create_info = (struct nci_hci_create_pipe_resp *)skb->data; dest_gate = create_info->dest_gate; new_pipe = create_info->pipe; if (new_pipe >= NCI_HCI_MAX_PIPES) { status = NCI_HCI_ANY_E_NOK; goto exit; } /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we * are the destination and it is our local gate */ ndev->hci_dev->gate2pipe[dest_gate] = new_pipe; ndev->hci_dev->pipes[new_pipe].gate = dest_gate; ndev->hci_dev->pipes[new_pipe].host = create_info->src_host; break; case NCI_HCI_ANY_OPEN_PIPE: /* If the pipe is not created report an error */ if (gate == NCI_HCI_INVALID_GATE) { status = NCI_HCI_ANY_E_NOK; goto exit; } break; case NCI_HCI_ADM_NOTIFY_PIPE_DELETED: if (skb->len != 1) { status = NCI_HCI_ANY_E_NOK; goto exit; } delete_info = (struct nci_hci_delete_pipe_noti *)skb->data; if (delete_info->pipe >= NCI_HCI_MAX_PIPES) { status = NCI_HCI_ANY_E_NOK; goto exit; } ndev->hci_dev->pipes[delete_info->pipe].gate = NCI_HCI_INVALID_GATE; ndev->hci_dev->pipes[delete_info->pipe].host = NCI_HCI_INVALID_HOST; break; case NCI_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED: if (skb->len != 1) { status = NCI_HCI_ANY_E_NOK; goto exit; } cleared_info = (struct nci_hci_all_pipe_cleared_noti *)skb->data; nci_hci_reset_pipes_per_host(ndev, cleared_info->host); break; default: pr_debug("Discarded unknown cmd %x to gate %x\n", cmd, gate); break; } if (ndev->ops->hci_cmd_received) ndev->ops->hci_cmd_received(ndev, pipe, cmd, skb); exit: nci_hci_send_data(ndev, pipe, status, NULL, 0); kfree_skb(skb); } static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, struct sk_buff *skb) { struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) goto exit; conn_info->rx_skb = skb; exit: nci_req_complete(ndev, NCI_STATUS_OK); } /* Receive hcp message for pipe, with type and cmd. * skb contains optional message data only. */ static void nci_hci_hcp_message_rx(struct nci_dev *ndev, u8 pipe, u8 type, u8 instruction, struct sk_buff *skb) { switch (type) { case NCI_HCI_HCP_RESPONSE: nci_hci_resp_received(ndev, pipe, skb); break; case NCI_HCI_HCP_COMMAND: nci_hci_cmd_received(ndev, pipe, instruction, skb); break; case NCI_HCI_HCP_EVENT: nci_hci_event_received(ndev, pipe, instruction, skb); break; default: pr_err("UNKNOWN MSG Type %d, instruction=%d\n", type, instruction); kfree_skb(skb); break; } nci_req_complete(ndev, NCI_STATUS_OK); } static void nci_hci_msg_rx_work(struct work_struct *work) { struct nci_hci_dev *hdev = container_of(work, struct nci_hci_dev, msg_rx_work); struct sk_buff *skb; const struct nci_hcp_message *message; u8 pipe, type, instruction; for (; (skb = skb_dequeue(&hdev->msg_rx_queue)); kcov_remote_stop()) { kcov_remote_start_common(skb_get_kcov_handle(skb)); pipe = NCI_HCP_MSG_GET_PIPE(skb->data[0]); skb_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN); message = (struct nci_hcp_message *)skb->data; type = NCI_HCP_MSG_GET_TYPE(message->header); instruction = NCI_HCP_MSG_GET_CMD(message->header); skb_pull(skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); nci_hci_hcp_message_rx(hdev->ndev, pipe, type, instruction, skb); } } void nci_hci_data_received_cb(void *context, struct sk_buff *skb, int err) { struct nci_dev *ndev = (struct nci_dev *)context; struct nci_hcp_packet *packet; u8 pipe, type; struct sk_buff *hcp_skb; struct sk_buff *frag_skb; int msg_len; if (err) { nci_req_complete(ndev, err); return; } packet = (struct nci_hcp_packet *)skb->data; if ((packet->header & ~NCI_HCI_FRAGMENT) == 0) { skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); return; } /* it's the last fragment. Does it need re-aggregation? */ if (skb_queue_len(&ndev->hci_dev->rx_hcp_frags)) { pipe = NCI_HCP_MSG_GET_PIPE(packet->header); skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); msg_len = 0; skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) { msg_len += (frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN); } hcp_skb = nfc_alloc_recv_skb(NCI_HCI_HCP_PACKET_HEADER_LEN + msg_len, GFP_KERNEL); if (!hcp_skb) { nci_req_complete(ndev, -ENOMEM); return; } skb_put_u8(hcp_skb, pipe); skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) { msg_len = frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN; skb_put_data(hcp_skb, frag_skb->data + NCI_HCI_HCP_PACKET_HEADER_LEN, msg_len); } skb_queue_purge(&ndev->hci_dev->rx_hcp_frags); } else { packet->header &= NCI_HCI_FRAGMENT; hcp_skb = skb; } /* if this is a response, dispatch immediately to * unblock waiting cmd context. Otherwise, enqueue to dispatch * in separate context where handler can also execute command. */ packet = (struct nci_hcp_packet *)hcp_skb->data; type = NCI_HCP_MSG_GET_TYPE(packet->message.header); if (type == NCI_HCI_HCP_RESPONSE) { pipe = NCI_HCP_MSG_GET_PIPE(packet->header); skb_pull(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN); nci_hci_hcp_message_rx(ndev, pipe, type, NCI_STATUS_OK, hcp_skb); } else { skb_queue_tail(&ndev->hci_dev->msg_rx_queue, hcp_skb); schedule_work(&ndev->hci_dev->msg_rx_work); } } int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe) { struct nci_data data; const struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_OPEN_PIPE); data.data = NULL; data.data_len = 0; return nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); } EXPORT_SYMBOL(nci_hci_open_pipe); static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, u8 dest_gate, int *result) { u8 pipe; struct sk_buff *skb; struct nci_hci_create_pipe_params params; const struct nci_hci_create_pipe_resp *resp; pr_debug("gate=%d\n", dest_gate); params.src_gate = NCI_HCI_ADMIN_GATE; params.dest_host = dest_host; params.dest_gate = dest_gate; *result = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_CREATE_PIPE, (u8 *)&params, sizeof(params), &skb); if (*result < 0) return NCI_HCI_INVALID_PIPE; resp = (struct nci_hci_create_pipe_resp *)skb->data; pipe = resp->pipe; kfree_skb(skb); pr_debug("pipe created=%d\n", pipe); return pipe; } static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe) { return nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL); } int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, const u8 *param, size_t param_len) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 *tmp; u8 pipe = ndev->hci_dev->gate2pipe[gate]; pr_debug("idx=%d to gate %d\n", idx, gate); if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; tmp = kmalloc(1 + param_len, GFP_KERNEL); if (!tmp) return -ENOMEM; *tmp = idx; memcpy(tmp + 1, param, param_len); data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_SET_PARAMETER); data.data = tmp; data.data_len = param_len + 1; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); } kfree(tmp); return r; } EXPORT_SYMBOL(nci_hci_set_param); int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, struct sk_buff **skb) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; pr_debug("idx=%d to gate %d\n", idx, gate); if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_GET_PARAMETER); data.data = &idx; data.data_len = 1; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); if (!r && skb) *skb = conn_info->rx_skb; } return r; } EXPORT_SYMBOL(nci_hci_get_param); int nci_hci_connect_gate(struct nci_dev *ndev, u8 dest_host, u8 dest_gate, u8 pipe) { bool pipe_created = false; int r; if (pipe == NCI_HCI_DO_NOT_OPEN_PIPE) return 0; if (ndev->hci_dev->gate2pipe[dest_gate] != NCI_HCI_INVALID_PIPE) return -EADDRINUSE; if (pipe != NCI_HCI_INVALID_PIPE) goto open_pipe; switch (dest_gate) { case NCI_HCI_LINK_MGMT_GATE: pipe = NCI_HCI_LINK_MGMT_PIPE; break; case NCI_HCI_ADMIN_GATE: pipe = NCI_HCI_ADMIN_PIPE; break; default: pipe = nci_hci_create_pipe(ndev, dest_host, dest_gate, &r); if (pipe == NCI_HCI_INVALID_PIPE) return r; pipe_created = true; break; } open_pipe: r = nci_hci_open_pipe(ndev, pipe); if (r < 0) { if (pipe_created) { if (nci_hci_delete_pipe(ndev, pipe) < 0) { /* TODO: Cannot clean by deleting pipe... * -> inconsistent state */ } } return r; } ndev->hci_dev->pipes[pipe].gate = dest_gate; ndev->hci_dev->pipes[pipe].host = dest_host; ndev->hci_dev->gate2pipe[dest_gate] = pipe; return 0; } EXPORT_SYMBOL(nci_hci_connect_gate); static int nci_hci_dev_connect_gates(struct nci_dev *ndev, u8 gate_count, const struct nci_hci_gate *gates) { int r; while (gate_count--) { r = nci_hci_connect_gate(ndev, gates->dest_host, gates->gate, gates->pipe); if (r < 0) return r; gates++; } return 0; } int nci_hci_dev_session_init(struct nci_dev *ndev) { struct nci_conn_info *conn_info; struct sk_buff *skb; int r; ndev->hci_dev->count_pipes = 0; ndev->hci_dev->expected_pipes = 0; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; conn_info->data_exchange_cb = nci_hci_data_received_cb; conn_info->data_exchange_cb_context = ndev; nci_hci_reset_pipes(ndev->hci_dev); if (ndev->hci_dev->init_data.gates[0].gate != NCI_HCI_ADMIN_GATE) return -EPROTO; r = nci_hci_connect_gate(ndev, ndev->hci_dev->init_data.gates[0].dest_host, ndev->hci_dev->init_data.gates[0].gate, ndev->hci_dev->init_data.gates[0].pipe); if (r < 0) return r; r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, &skb); if (r < 0) return r; if (skb->len && skb->len == strlen(ndev->hci_dev->init_data.session_id) && !memcmp(ndev->hci_dev->init_data.session_id, skb->data, skb->len) && ndev->ops->hci_load_session) { /* Restore gate<->pipe table from some proprietary location. */ r = ndev->ops->hci_load_session(ndev); } else { r = nci_hci_clear_all_pipes(ndev); if (r < 0) goto exit; r = nci_hci_dev_connect_gates(ndev, ndev->hci_dev->init_data.gate_count, ndev->hci_dev->init_data.gates); if (r < 0) goto exit; r = nci_hci_set_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, ndev->hci_dev->init_data.session_id, strlen(ndev->hci_dev->init_data.session_id)); } exit: kfree_skb(skb); return r; } EXPORT_SYMBOL(nci_hci_dev_session_init); struct nci_hci_dev *nci_hci_allocate(struct nci_dev *ndev) { struct nci_hci_dev *hdev; hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); if (!hdev) return NULL; skb_queue_head_init(&hdev->rx_hcp_frags); INIT_WORK(&hdev->msg_rx_work, nci_hci_msg_rx_work); skb_queue_head_init(&hdev->msg_rx_queue); hdev->ndev = ndev; return hdev; } void nci_hci_deallocate(struct nci_dev *ndev) { kfree(ndev->hci_dev); }
9 354 528 23 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions of the Internet Protocol. * * Version: @(#)in.h 1.0.1 04/21/93 * * Authors: Original taken from the GNU Project <netinet/in.h> file. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IN_H #define _LINUX_IN_H #include <linux/errno.h> #include <uapi/linux/in.h> static inline int proto_ports_offset(int proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_DCCP: case IPPROTO_ESP: /* SPI */ case IPPROTO_SCTP: case IPPROTO_UDPLITE: return 0; case IPPROTO_AH: /* SPI */ return 4; default: return -EINVAL; } } static inline bool ipv4_is_loopback(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x7f000000); } static inline bool ipv4_is_multicast(__be32 addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } static inline bool ipv4_is_local_multicast(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xe0000000); } static inline bool ipv4_is_lbcast(__be32 addr) { /* limited broadcast */ return addr == htonl(INADDR_BROADCAST); } static inline bool ipv4_is_all_snoopers(__be32 addr) { return addr == htonl(INADDR_ALLSNOOPERS_GROUP); } static inline bool ipv4_is_zeronet(__be32 addr) { return (addr == 0); } /* Special-Use IPv4 Addresses (RFC3330) */ static inline bool ipv4_is_private_10(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x0a000000); } static inline bool ipv4_is_private_172(__be32 addr) { return (addr & htonl(0xfff00000)) == htonl(0xac100000); } static inline bool ipv4_is_private_192(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); } static inline bool ipv4_is_linklocal_169(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000); } static inline bool ipv4_is_anycast_6to4(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0586300); } static inline bool ipv4_is_test_192(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0000200); } static inline bool ipv4_is_test_198(__be32 addr) { return (addr & htonl(0xfffe0000)) == htonl(0xc6120000); } #endif /* _LINUX_IN_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #include <linux/file.h> #include <linux/interval_tree.h> #include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/slab.h> #include <linux/vfio.h> #include <uapi/linux/vfio.h> #include <uapi/linux/iommufd.h> #include "iommufd_private.h" static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx) { struct iommufd_ioas *ioas = ERR_PTR(-ENODEV); xa_lock(&ictx->objects); if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj)) goto out_unlock; ioas = ictx->vfio_ioas; out_unlock: xa_unlock(&ictx->objects); return ioas; } /** * iommufd_vfio_compat_ioas_get_id - Ensure a compat IOAS exists * @ictx: Context to operate on * @out_ioas_id: The IOAS ID of the compatibility IOAS * * Return the ID of the current compatibility IOAS. The ID can be passed into * other functions that take an ioas_id. */ int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) { struct iommufd_ioas *ioas; ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); *out_ioas_id = ioas->obj.id; iommufd_put_object(&ioas->obj); return 0; } EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO); /** * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached * @ictx: Context to operate on * * This allows selecting the VFIO_NOIOMMU_IOMMU and blocks normal types. */ int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) { int ret; xa_lock(&ictx->objects); if (!ictx->vfio_ioas) { ictx->no_iommu_mode = 1; ret = 0; } else { ret = -EINVAL; } xa_unlock(&ictx->objects); return ret; } EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO); /** * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created * @ictx: Context to operate on * * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate * on since they do not have an IOAS ID input in their ABI. Only attaching a * group should cause a default creation of the internal ioas, this does nothing * if an existing ioas has already been assigned somehow. */ int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx) { struct iommufd_ioas *ioas = NULL; int ret; ioas = iommufd_ioas_alloc(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); xa_lock(&ictx->objects); /* * VFIO won't allow attaching a container to both iommu and no iommu * operation */ if (ictx->no_iommu_mode) { ret = -EINVAL; goto out_abort; } if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) { ret = 0; iommufd_put_object(&ictx->vfio_ioas->obj); goto out_abort; } ictx->vfio_ioas = ioas; xa_unlock(&ictx->objects); /* * An automatically created compat IOAS is treated as a userspace * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET, * and if not manually destroyed it will be destroyed automatically * at iommufd release. */ iommufd_object_finalize(ictx, &ioas->obj); return 0; out_abort: xa_unlock(&ictx->objects); iommufd_object_abort(ictx, &ioas->obj); return ret; } EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) { struct iommu_vfio_ioas *cmd = ucmd->cmd; struct iommufd_ioas *ioas; if (cmd->__reserved) return -EOPNOTSUPP; switch (cmd->op) { case IOMMU_VFIO_IOAS_GET: ioas = get_compat_ioas(ucmd->ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); cmd->ioas_id = ioas->obj.id; iommufd_put_object(&ioas->obj); return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); case IOMMU_VFIO_IOAS_SET: ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); if (IS_ERR(ioas)) return PTR_ERR(ioas); xa_lock(&ucmd->ictx->objects); ucmd->ictx->vfio_ioas = ioas; xa_unlock(&ucmd->ictx->objects); iommufd_put_object(&ioas->obj); return 0; case IOMMU_VFIO_IOAS_CLEAR: xa_lock(&ucmd->ictx->objects); ucmd->ictx->vfio_ioas = NULL; xa_unlock(&ucmd->ictx->objects); return 0; default: return -EOPNOTSUPP; } } static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd, void __user *arg) { u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); struct vfio_iommu_type1_dma_map map; int iommu_prot = IOMMU_CACHE; struct iommufd_ioas *ioas; unsigned long iova; int rc; if (copy_from_user(&map, arg, minsz)) return -EFAULT; if (map.argsz < minsz || map.flags & ~supported_flags) return -EINVAL; if (map.flags & VFIO_DMA_MAP_FLAG_READ) iommu_prot |= IOMMU_READ; if (map.flags & VFIO_DMA_MAP_FLAG_WRITE) iommu_prot |= IOMMU_WRITE; ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); /* * Maps created through the legacy interface always use VFIO compatible * rlimit accounting. If the user wishes to use the faster user based * rlimit accounting then they must use the new interface. */ iova = map.iova; rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr), map.size, iommu_prot, 0); iommufd_put_object(&ioas->obj); return rc; } static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd, void __user *arg) { size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); /* * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new * dirty tracking direction: * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/ * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/ */ u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL; struct vfio_iommu_type1_dma_unmap unmap; unsigned long unmapped = 0; struct iommufd_ioas *ioas; int rc; if (copy_from_user(&unmap, arg, minsz)) return -EFAULT; if (unmap.argsz < minsz || unmap.flags & ~supported_flags) return -EINVAL; ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) { if (unmap.iova != 0 || unmap.size != 0) { rc = -EINVAL; goto err_put; } rc = iopt_unmap_all(&ioas->iopt, &unmapped); } else { if (READ_ONCE(ioas->iopt.disable_large_pages)) { /* * Create cuts at the start and last of the requested * range. If the start IOVA is 0 then it doesn't need to * be cut. */ unsigned long iovas[] = { unmap.iova + unmap.size - 1, unmap.iova - 1 }; rc = iopt_cut_iova(&ioas->iopt, iovas, unmap.iova ? 2 : 1); if (rc) goto err_put; } rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size, &unmapped); } unmap.size = unmapped; if (copy_to_user(arg, &unmap, minsz)) rc = -EFAULT; err_put: iommufd_put_object(&ioas->obj); return rc; } static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) { struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = 1; ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); mutex_lock(&ioas->mutex); list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { if (!hwpt_paging->enforce_cache_coherency) { rc = 0; break; } } mutex_unlock(&ioas->mutex); iommufd_put_object(&ioas->obj); return rc; } static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, unsigned long type) { switch (type) { case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_IOMMU: case VFIO_UNMAP_ALL: return 1; case VFIO_NOIOMMU_IOMMU: return IS_ENABLED(CONFIG_VFIO_NOIOMMU); case VFIO_DMA_CC_IOMMU: return iommufd_vfio_cc_iommu(ictx); /* * This is obsolete, and to be removed from VFIO. It was an incomplete * idea that got merged. * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ */ case VFIO_TYPE1_NESTING_IOMMU: return 0; /* * VFIO_DMA_MAP_FLAG_VADDR * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/ * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/ * * It is hard to see how this could be implemented safely. */ case VFIO_UPDATE_VADDR: default: return 0; } } static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type) { bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode); struct iommufd_ioas *ioas = NULL; int rc = 0; /* * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all * other ioctls. We let them keep working but they mostly fail since no * IOAS should exist. */ if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU && no_iommu_mode) { if (!capable(CAP_SYS_RAWIO)) return -EPERM; return 0; } if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) || no_iommu_mode) return -EINVAL; /* VFIO fails the set_iommu if there is no group */ ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); /* * The difference between TYPE1 and TYPE1v2 is the ability to unmap in * the middle of mapped ranges. This is complicated by huge page support * which creates single large IOPTEs that cannot be split by the iommu * driver. TYPE1 is very old at this point and likely nothing uses it, * however it is simple enough to emulate by simply disabling the * problematic large IOPTEs. Then we can safely unmap within any range. */ if (type == VFIO_TYPE1_IOMMU) rc = iopt_disable_large_pages(&ioas->iopt); iommufd_put_object(&ioas->obj); return rc; } static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas) { struct io_pagetable *iopt = &ioas->iopt; unsigned long pgsize_bitmap = ULONG_MAX; struct iommu_domain *domain; unsigned long index; down_read(&iopt->domains_rwsem); xa_for_each(&iopt->domains, index, domain) pgsize_bitmap &= domain->pgsize_bitmap; /* See vfio_update_pgsize_bitmap() */ if (pgsize_bitmap & ~PAGE_MASK) { pgsize_bitmap &= PAGE_MASK; pgsize_bitmap |= PAGE_SIZE; } pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment); up_read(&iopt->domains_rwsem); return pgsize_bitmap; } static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas, struct vfio_info_cap_header __user *cur, size_t avail) { struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas = container_of(cur, struct vfio_iommu_type1_info_cap_iova_range __user, header); struct vfio_iommu_type1_info_cap_iova_range cap_iovas = { .header = { .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, .version = 1, }, }; struct interval_tree_span_iter span; interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, ULONG_MAX) { struct vfio_iova_range range; if (!span.is_hole) continue; range.start = span.start_hole; range.end = span.last_hole; if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas + 1) && copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas], &range, sizeof(range))) return -EFAULT; cap_iovas.nr_iovas++; } if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) && copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas))) return -EFAULT; return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas); } static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas, struct vfio_info_cap_header __user *cur, size_t avail) { struct vfio_iommu_type1_info_dma_avail cap_dma = { .header = { .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL, .version = 1, }, /* * iommufd's limit is based on the cgroup's memory limit. * Normally vfio would return U16_MAX here, and provide a module * parameter to adjust it. Since S390 qemu userspace actually * pays attention and needs a value bigger than U16_MAX return * U32_MAX. */ .avail = U32_MAX, }; if (avail >= sizeof(cap_dma) && copy_to_user(cur, &cap_dma, sizeof(cap_dma))) return -EFAULT; return sizeof(cap_dma); } static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, void __user *arg) { typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas, struct vfio_info_cap_header __user *cur, size_t avail); static const fill_cap_fn fill_fns[] = { iommufd_fill_cap_dma_avail, iommufd_fill_cap_iova, }; size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); struct vfio_info_cap_header __user *last_cap = NULL; struct vfio_iommu_type1_info info = {}; struct iommufd_ioas *ioas; size_t total_cap_size; int rc; int i; if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; minsz = min_t(size_t, info.argsz, sizeof(info)); ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); info.flags = VFIO_IOMMU_INFO_PGSIZES; info.iova_pgsizes = iommufd_get_pagesizes(ioas); info.cap_offset = 0; down_read(&ioas->iopt.iova_rwsem); total_cap_size = sizeof(info); for (i = 0; i != ARRAY_SIZE(fill_fns); i++) { int cap_size; if (info.argsz > total_cap_size) cap_size = fill_fns[i](ioas, arg + total_cap_size, info.argsz - total_cap_size); else cap_size = fill_fns[i](ioas, NULL, 0); if (cap_size < 0) { rc = cap_size; goto out_put; } cap_size = ALIGN(cap_size, sizeof(u64)); if (last_cap && info.argsz >= total_cap_size && put_user(total_cap_size, &last_cap->next)) { rc = -EFAULT; goto out_put; } last_cap = arg + total_cap_size; total_cap_size += cap_size; } /* * If the user did not provide enough space then only some caps are * returned and the argsz will be updated to the correct amount to get * all caps. */ if (info.argsz >= total_cap_size) info.cap_offset = sizeof(info); info.argsz = total_cap_size; info.flags |= VFIO_IOMMU_INFO_CAPS; if (copy_to_user(arg, &info, minsz)) { rc = -EFAULT; goto out_put; } rc = 0; out_put: up_read(&ioas->iopt.iova_rwsem); iommufd_put_object(&ioas->obj); return rc; } int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, unsigned long arg) { void __user *uarg = (void __user *)arg; switch (cmd) { case VFIO_GET_API_VERSION: return VFIO_API_VERSION; case VFIO_SET_IOMMU: return iommufd_vfio_set_iommu(ictx, arg); case VFIO_CHECK_EXTENSION: return iommufd_vfio_check_extension(ictx, arg); case VFIO_IOMMU_GET_INFO: return iommufd_vfio_iommu_get_info(ictx, uarg); case VFIO_IOMMU_MAP_DMA: return iommufd_vfio_map_dma(ictx, cmd, uarg); case VFIO_IOMMU_UNMAP_DMA: return iommufd_vfio_unmap_dma(ictx, cmd, uarg); case VFIO_IOMMU_DIRTY_PAGES: default: return -ENOIOCTLCMD; } return -ENOIOCTLCMD; }
49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 /* * Copyright (C) 2011-2013 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/errno.h> #include <linux/export.h> #include <linux/kernel.h> #include <drm/drm_mode.h> #include <drm/drm_print.h> #include <drm/drm_rect.h> /** * drm_rect_intersect - intersect two rectangles * @r1: first rectangle * @r2: second rectangle * * Calculate the intersection of rectangles @r1 and @r2. * @r1 will be overwritten with the intersection. * * RETURNS: * %true if rectangle @r1 is still visible after the operation, * %false otherwise. */ bool drm_rect_intersect(struct drm_rect *r1, const struct drm_rect *r2) { r1->x1 = max(r1->x1, r2->x1); r1->y1 = max(r1->y1, r2->y1); r1->x2 = min(r1->x2, r2->x2); r1->y2 = min(r1->y2, r2->y2); return drm_rect_visible(r1); } EXPORT_SYMBOL(drm_rect_intersect); static u32 clip_scaled(int src, int dst, int *clip) { u64 tmp; if (dst == 0) return 0; /* Only clip what we have. Keeps the result bounded. */ *clip = min(*clip, dst); tmp = mul_u32_u32(src, dst - *clip); /* * Round toward 1.0 when clipping so that we don't accidentally * change upscaling to downscaling or vice versa. */ if (src < (dst << 16)) return DIV_ROUND_UP_ULL(tmp, dst); else return DIV_ROUND_DOWN_ULL(tmp, dst); } /** * drm_rect_clip_scaled - perform a scaled clip operation * @src: source window rectangle * @dst: destination window rectangle * @clip: clip rectangle * * Clip rectangle @dst by rectangle @clip. Clip rectangle @src by * the corresponding amounts, retaining the vertical and horizontal scaling * factors from @src to @dst. * * RETURNS: * * %true if rectangle @dst is still visible after being clipped, * %false otherwise. */ bool drm_rect_clip_scaled(struct drm_rect *src, struct drm_rect *dst, const struct drm_rect *clip) { int diff; diff = clip->x1 - dst->x1; if (diff > 0) { u32 new_src_w = clip_scaled(drm_rect_width(src), drm_rect_width(dst), &diff); src->x1 = src->x2 - new_src_w; dst->x1 += diff; } diff = clip->y1 - dst->y1; if (diff > 0) { u32 new_src_h = clip_scaled(drm_rect_height(src), drm_rect_height(dst), &diff); src->y1 = src->y2 - new_src_h; dst->y1 += diff; } diff = dst->x2 - clip->x2; if (diff > 0) { u32 new_src_w = clip_scaled(drm_rect_width(src), drm_rect_width(dst), &diff); src->x2 = src->x1 + new_src_w; dst->x2 -= diff; } diff = dst->y2 - clip->y2; if (diff > 0) { u32 new_src_h = clip_scaled(drm_rect_height(src), drm_rect_height(dst), &diff); src->y2 = src->y1 + new_src_h; dst->y2 -= diff; } return drm_rect_visible(dst); } EXPORT_SYMBOL(drm_rect_clip_scaled); static int drm_calc_scale(int src, int dst) { int scale = 0; if (WARN_ON(src < 0 || dst < 0)) return -EINVAL; if (dst == 0) return 0; if (src > (dst << 16)) return DIV_ROUND_UP(src, dst); else scale = src / dst; return scale; } /** * drm_rect_calc_hscale - calculate the horizontal scaling factor * @src: source window rectangle * @dst: destination window rectangle * @min_hscale: minimum allowed horizontal scaling factor * @max_hscale: maximum allowed horizontal scaling factor * * Calculate the horizontal scaling factor as * (@src width) / (@dst width). * * If the scale is below 1 << 16, round down. If the scale is above * 1 << 16, round up. This will calculate the scale with the most * pessimistic limit calculation. * * RETURNS: * The horizontal scaling factor, or errno of out of limits. */ int drm_rect_calc_hscale(const struct drm_rect *src, const struct drm_rect *dst, int min_hscale, int max_hscale) { int src_w = drm_rect_width(src); int dst_w = drm_rect_width(dst); int hscale = drm_calc_scale(src_w, dst_w); if (hscale < 0 || dst_w == 0) return hscale; if (hscale < min_hscale || hscale > max_hscale) return -ERANGE; return hscale; } EXPORT_SYMBOL(drm_rect_calc_hscale); /** * drm_rect_calc_vscale - calculate the vertical scaling factor * @src: source window rectangle * @dst: destination window rectangle * @min_vscale: minimum allowed vertical scaling factor * @max_vscale: maximum allowed vertical scaling factor * * Calculate the vertical scaling factor as * (@src height) / (@dst height). * * If the scale is below 1 << 16, round down. If the scale is above * 1 << 16, round up. This will calculate the scale with the most * pessimistic limit calculation. * * RETURNS: * The vertical scaling factor, or errno of out of limits. */ int drm_rect_calc_vscale(const struct drm_rect *src, const struct drm_rect *dst, int min_vscale, int max_vscale) { int src_h = drm_rect_height(src); int dst_h = drm_rect_height(dst); int vscale = drm_calc_scale(src_h, dst_h); if (vscale < 0 || dst_h == 0) return vscale; if (vscale < min_vscale || vscale > max_vscale) return -ERANGE; return vscale; } EXPORT_SYMBOL(drm_rect_calc_vscale); /** * drm_rect_debug_print - print the rectangle information * @prefix: prefix string * @r: rectangle to print * @fixed_point: rectangle is in 16.16 fixed point format */ void drm_rect_debug_print(const char *prefix, const struct drm_rect *r, bool fixed_point) { if (fixed_point) DRM_DEBUG_KMS("%s" DRM_RECT_FP_FMT "\n", prefix, DRM_RECT_FP_ARG(r)); else DRM_DEBUG_KMS("%s" DRM_RECT_FMT "\n", prefix, DRM_RECT_ARG(r)); } EXPORT_SYMBOL(drm_rect_debug_print); /** * drm_rect_rotate - Rotate the rectangle * @r: rectangle to be rotated * @width: Width of the coordinate space * @height: Height of the coordinate space * @rotation: Transformation to be applied * * Apply @rotation to the coordinates of rectangle @r. * * @width and @height combined with @rotation define * the location of the new origin. * * @width correcsponds to the horizontal and @height * to the vertical axis of the untransformed coordinate * space. */ void drm_rect_rotate(struct drm_rect *r, int width, int height, unsigned int rotation) { struct drm_rect tmp; if (rotation & (DRM_MODE_REFLECT_X | DRM_MODE_REFLECT_Y)) { tmp = *r; if (rotation & DRM_MODE_REFLECT_X) { r->x1 = width - tmp.x2; r->x2 = width - tmp.x1; } if (rotation & DRM_MODE_REFLECT_Y) { r->y1 = height - tmp.y2; r->y2 = height - tmp.y1; } } switch (rotation & DRM_MODE_ROTATE_MASK) { case DRM_MODE_ROTATE_0: break; case DRM_MODE_ROTATE_90: tmp = *r; r->x1 = tmp.y1; r->x2 = tmp.y2; r->y1 = width - tmp.x2; r->y2 = width - tmp.x1; break; case DRM_MODE_ROTATE_180: tmp = *r; r->x1 = width - tmp.x2; r->x2 = width - tmp.x1; r->y1 = height - tmp.y2; r->y2 = height - tmp.y1; break; case DRM_MODE_ROTATE_270: tmp = *r; r->x1 = height - tmp.y2; r->x2 = height - tmp.y1; r->y1 = tmp.x1; r->y2 = tmp.x2; break; default: break; } } EXPORT_SYMBOL(drm_rect_rotate); /** * drm_rect_rotate_inv - Inverse rotate the rectangle * @r: rectangle to be rotated * @width: Width of the coordinate space * @height: Height of the coordinate space * @rotation: Transformation whose inverse is to be applied * * Apply the inverse of @rotation to the coordinates * of rectangle @r. * * @width and @height combined with @rotation define * the location of the new origin. * * @width correcsponds to the horizontal and @height * to the vertical axis of the original untransformed * coordinate space, so that you never have to flip * them when doing a rotatation and its inverse. * That is, if you do :: * * drm_rect_rotate(&r, width, height, rotation); * drm_rect_rotate_inv(&r, width, height, rotation); * * you will always get back the original rectangle. */ void drm_rect_rotate_inv(struct drm_rect *r, int width, int height, unsigned int rotation) { struct drm_rect tmp; switch (rotation & DRM_MODE_ROTATE_MASK) { case DRM_MODE_ROTATE_0: break; case DRM_MODE_ROTATE_90: tmp = *r; r->x1 = width - tmp.y2; r->x2 = width - tmp.y1; r->y1 = tmp.x1; r->y2 = tmp.x2; break; case DRM_MODE_ROTATE_180: tmp = *r; r->x1 = width - tmp.x2; r->x2 = width - tmp.x1; r->y1 = height - tmp.y2; r->y2 = height - tmp.y1; break; case DRM_MODE_ROTATE_270: tmp = *r; r->x1 = tmp.y1; r->x2 = tmp.y2; r->y1 = height - tmp.x2; r->y2 = height - tmp.x1; break; default: break; } if (rotation & (DRM_MODE_REFLECT_X | DRM_MODE_REFLECT_Y)) { tmp = *r; if (rotation & DRM_MODE_REFLECT_X) { r->x1 = width - tmp.x2; r->x2 = width - tmp.x1; } if (rotation & DRM_MODE_REFLECT_Y) { r->y1 = height - tmp.y2; r->y2 = height - tmp.y1; } } } EXPORT_SYMBOL(drm_rect_rotate_inv);
41 35 35 35 4 4 4 4 4 6 6 6 5 2 1 1 1 49 49 73 72 49 19 14 49 47 33 33 5 33 47 47 47 47 47 2 73 1 1 1 1 1 6 6 6 6 41 41 40 40 40 40 35 40 40 41 231 231 226 226 43 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * PACKET - implements raw packet sockets. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Fixes: * Alan Cox : verify_area() now used correctly * Alan Cox : new skbuff lists, look ma no backlogs! * Alan Cox : tidied skbuff lists. * Alan Cox : Now uses generic datagram routines I * added. Also fixed the peek/read crash * from all old Linux datagram code. * Alan Cox : Uses the improved datagram code. * Alan Cox : Added NULL's for socket options. * Alan Cox : Re-commented the code. * Alan Cox : Use new kernel side addressing * Rob Janssen : Correct MTU usage. * Dave Platt : Counter leaks caused by incorrect * interrupt locking and some slightly * dubious gcc output. Can you read * compiler: it said _VOLATILE_ * Richard Kooijman : Timestamp fixes. * Alan Cox : New buffers. Use sk->mac.raw. * Alan Cox : sendmsg/recvmsg support. * Alan Cox : Protocol setting support * Alexey Kuznetsov : Untied from IPv4 stack. * Cyrus Durgin : Fixed kerneld for kmod. * Michal Ostrowski : Module initialization cleanup. * Ulises Alonso : Frame number limit removal and * packet_set_ring memory leak. * Eric Biederman : Allow for > 8 byte hardware addresses. * The convention is that longer addresses * will simply extend the hardware address * byte arrays at the end of sockaddr_ll * and packet_mreq. * Johann Baudy : Added TX RING. * Chetan Loke : Implemented TPACKET_V3 block abstraction * layer. * Copyright (C) 2011, <lokec@ccs.neu.edu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/wireless.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/io.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/if_vlan.h> #include <linux/virtio_net.h> #include <linux/errqueue.h> #include <linux/net_tstamp.h> #include <linux/percpu.h> #ifdef CONFIG_INET #include <net/inet_common.h> #endif #include <linux/bpf.h> #include <net/compat.h> #include <linux/netfilter_netdev.h> #include "internal.h" /* Assumptions: - If the device has no dev->header_ops->create, there is no LL header visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. For example, a WiFi driver pretending to be an Ethernet driver should set its hard_header_len to be the Ethernet header length, and set its needed_headroom to be (the real WiFi header length - the fake Ethernet header length). - packet socket receives packets with pulled ll header, so that SOCK_RAW should push it back. On receive: ----------- Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. On transmit: ------------ dev_has_header(dev) == true mac_header -> ll header data -> ll header dev_has_header(dev) == false (ll header is invisible to us) mac_header -> data data -> data We should set network_header on output to the correct position, packet classifier depends on it. */ /* Private packet socket structures. */ /* identical to struct packet_mreq except it has * a longer address field. */ struct packet_mreq_max { int mr_ifindex; unsigned short mr_type; unsigned short mr_alen; unsigned char mr_address[MAX_ADDR_LEN]; }; union tpacket_uhdr { struct tpacket_hdr *h1; struct tpacket2_hdr *h2; struct tpacket3_hdr *h3; void *raw; }; static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring); #define V3_ALIGNMENT (8) #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) #define BLK_PLUS_PRIV(sz_of_priv) \ (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) struct packet_sock; static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status); static void packet_increment_head(struct packet_ring_buffer *buff); static int prb_curr_blk_in_use(struct tpacket_block_desc *); static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, struct packet_sock *); static void prb_retire_current_block(struct tpacket_kbdq_core *, struct packet_sock *, unsigned int status); static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); static void prb_retire_rx_blk_timer_expired(struct timer_list *); static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); static u16 packet_pick_tx_queue(struct sk_buff *skb); struct packet_skb_cb { union { struct sockaddr_pkt pkt; union { /* Trick: alias skb original length with * ll.sll_family and ll.protocol in order * to save room. */ unsigned int origlen; struct sockaddr_ll ll; }; } sa; }; #define vio_le() virtio_legacy_is_little_endian() #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) #define GET_PBLOCK_DESC(x, bid) \ ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) #define GET_NEXT_PRB_BLK_NUM(x) \ (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \ ((x)->kactive_blk_num+1) : 0) static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); #ifdef CONFIG_NETFILTER_EGRESS static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb) { struct sk_buff *next, *head = NULL, *tail; int rc; rcu_read_lock(); for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); if (!nf_hook_egress(skb, &rc, skb->dev)) continue; if (!head) head = skb; else tail->next = skb; tail = skb; } rcu_read_unlock(); return head; } #endif static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb) { if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS)) return dev_queue_xmit(skb); #ifdef CONFIG_NETFILTER_EGRESS if (nf_hook_egress_active()) { skb = nf_hook_direct_egress(skb); if (!skb) return NET_XMIT_DROP; } #endif return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } static struct net_device *packet_cached_dev_get(struct packet_sock *po) { struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(po->cached_dev); dev_hold(dev); rcu_read_unlock(); return dev; } static void packet_cached_dev_assign(struct packet_sock *po, struct net_device *dev) { rcu_assign_pointer(po->cached_dev, dev); } static void packet_cached_dev_reset(struct packet_sock *po) { RCU_INIT_POINTER(po->cached_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) { struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; int cpu = raw_smp_processor_id(); u16 queue_index; #ifdef CONFIG_XPS skb->sender_cpu = cpu + 1; #endif skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues); if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb, NULL); queue_index = netdev_cap_txqueue(dev, queue_index); } else { queue_index = netdev_pick_tx(dev, skb, NULL); } return queue_index; } /* __register_prot_hook must be invoked through register_prot_hook * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). */ static void __register_prot_hook(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) { if (po->fanout) __fanout_link(sk, po); else dev_add_pack(&po->prot_hook); sock_hold(sk); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1); } } static void register_prot_hook(struct sock *sk) { lockdep_assert_held_once(&pkt_sk(sk)->bind_lock); __register_prot_hook(sk); } /* If the sync parameter is true, we will temporarily drop * the po->bind_lock and do a synchronize_net to make sure no * asynchronous packet processing paths still refer to the elements * of po->prot_hook. If the sync parameter is false, it is the * callers responsibility to take care of this. */ static void __unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); lockdep_assert_held_once(&po->bind_lock); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0); if (po->fanout) __fanout_unlink(sk, po); else __dev_remove_pack(&po->prot_hook); __sock_put(sk); if (sync) { spin_unlock(&po->bind_lock); synchronize_net(); spin_lock(&po->bind_lock); } } static void unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) __unregister_prot_hook(sk, sync); } static inline struct page * __pure pgv_to_page(void *addr) { if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); return virt_to_page(addr); } static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } smp_wmb(); } static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; smp_rmb(); /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); return 0; } } static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps && (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts)) return TP_STATUS_TS_RAW_HARDWARE; if ((flags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb_tstamp(skb), ts)) return TP_STATUS_TS_SOFTWARE; return 0; } static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, struct sk_buff *skb) { union tpacket_uhdr h; struct timespec64 ts; __u32 ts_status; if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp)))) return 0; h.raw = frame; /* * versions 1 through 3 overflow the timestamps in y2106, since they * all store the seconds in a 32-bit unsigned integer. * If we create a version 4, that should have a 64-bit timestamp, * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit * nanoseconds. */ switch (po->tp_version) { case TPACKET_V1: h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; break; case TPACKET_V2: h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3: h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } /* one flush is safe, as both fields always lie on the same cacheline */ flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); smp_wmb(); return ts_status; } static void *packet_lookup_frame(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int position, int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; pg_vec_pos = position / rb->frames_per_block; frame_offset = position % rb->frames_per_block; h.raw = rb->pg_vec[pg_vec_pos].buffer + (frame_offset * rb->frame_size); if (status != __packet_get_status(po, h.raw)) return NULL; return h.raw; } static void *packet_current_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { return packet_lookup_frame(po, rb, rb->head, status); } static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) { del_timer_sync(&pkc->retire_blk_timer); } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); spin_lock_bh(&rb_queue->lock); pkc->delete_blk_timer = 1; spin_unlock_bh(&rb_queue->lock); prb_del_retire_blk_timer(pkc); } static void prb_setup_retire_blk_timer(struct packet_sock *po) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, 0); pkc->retire_blk_timer.expires = jiffies; } static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); if (unlikely(!dev)) { rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); if (err) return DEFAULT_PRB_RETIRE_TOV; /* If the link speed is so slow you don't really * need to worry about perf anyways */ if (ecmd.base.speed < SPEED_1000 || ecmd.base.speed == SPEED_UNKNOWN) return DEFAULT_PRB_RETIRE_TOV; div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; if (div) return mbits + 1; return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, union tpacket_req_u *req_u) { p1->feature_req_word = req_u->req3.tp_feature_req_word; } static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; memset(p1, 0x0, sizeof(*p1)); p1->knxt_seq_num = 1; p1->pkbdq = pg_vec; pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; p1->pkblk_start = pg_vec[0].buffer; p1->kblk_size = req_u->req3.tp_block_size; p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; p1->last_kactive_blk_num = 0; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; else p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, req_u->req3.tp_block_size); p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); prb_setup_retire_blk_timer(po); prb_open_block(p1, pbd); } /* Do NOT update the last_blk_num first. * Assumes sk_buff_head lock is held. */ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) { mod_timer(&pkc->retire_blk_timer, jiffies + pkc->tov_in_jiffies); pkc->last_kactive_blk_num = pkc->kactive_blk_num; } /* * Timer logic: * 1) We refresh the timer only when we open a block. * By doing this we don't waste cycles refreshing the timer * on packet-by-packet basis. * * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * * So, if the user sets the 'tmo' to 10ms then the timer * will never fire while the block is still getting filled * (which is what we want). However, the user could choose * to close a block early and that's fine. * * But when the timer does fire, we check whether or not to refresh it. * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. * */ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) { struct packet_sock *po = from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsigned int frozen; struct tpacket_block_desc *pbd; spin_lock(&po->sk.sk_receive_queue.lock); frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); if (unlikely(pkc->delete_blk_timer)) goto out; /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() * copy_bits() is in progress ... * timer fires on other cpu: * we can't retire the current block because copy_bits * is in progress. * */ if (BLOCK_NUM_PKTS(pbd)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { if (!frozen) { if (!BLOCK_NUM_PKTS(pbd)) { /* An empty block. Just refresh the timer. */ goto refresh_timer; } prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); if (!prb_dispatch_next_block(pkc, po)) goto refresh_timer; else goto out; } else { /* Case 1. Queue was frozen because user-space was * lagging behind. */ if (prb_curr_blk_in_use(pbd)) { /* * Ok, user-space is still behind. * So just refresh the timer. */ goto refresh_timer; } else { /* Case 2. queue was frozen,user-space caught up, * now the link went idle && the timer fired. * We don't have a block to close.So we open this * block and restart the timer. * opening a block thaws the queue,restarts timer * Thawing/timer-refresh is a side effect. */ prb_open_block(pkc, pbd); goto out; } } } refresh_timer: _prb_refresh_rx_retire_blk_timer(pkc); out: spin_unlock(&po->sk.sk_receive_queue.lock); } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, __u32 status) { /* Flush everything minus the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 u8 *start, *end; start = (u8 *)pbd1; /* Skip the block header(we know header WILL fit in 4K) */ start += PAGE_SIZE; end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); for (; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif /* Now update the block status. */ BLOCK_STATUS(pbd1) = status; /* Flush the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 start = (u8 *)pbd1; flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif } /* * Side effect: * * 1) flush the block * 2) Increment active_blk_num * * Note:We DONT refresh the timer on purpose. * Because almost always the next block will be opened. */ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, struct packet_sock *po, unsigned int stat) { __u32 status = TP_STATUS_USER | stat; struct tpacket3_hdr *last_pkt; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; last_pkt->tp_next_offset = 0; /* Get the ts of the last pkt */ if (BLOCK_NUM_PKTS(pbd1)) { h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; } else { /* Ok, we tmo'd - so get the current time. * * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired(). */ struct timespec64 ts; ktime_get_real_ts64(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; h1->ts_last_pkt.ts_nsec = ts.tv_nsec; } smp_wmb(); /* Flush the block */ prb_flush_block(pkc1, pbd1, status); sk->sk_data_ready(sk); pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); } static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) { pkc->reset_pending_on_curr_blk = 0; } /* * Side effect of opening a block: * * 1) prb_queue is thawed. * 2) retire_blk_timer is refreshed. * */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) { struct timespec64 ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; smp_rmb(); /* We could have just memset this but we will lose the * flexibility of making the priv area sticky */ BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); ktime_get_real_ts64(&ts); h1->ts_first_pkt.ts_sec = ts.tv_sec; h1->ts_first_pkt.ts_nsec = ts.tv_nsec; pkc1->pkblk_start = (char *)pbd1; pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; pbd1->version = pkc1->version; pkc1->prev = pkc1->nxt_offset; pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); _prb_refresh_rx_retire_blk_timer(pkc1); smp_wmb(); } /* * Queue freeze logic: * 1) Assume tp_block_nr = 8 blocks. * 2) At time 't0', user opens Rx ring. * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 * 4) user-space is either sleeping or processing block '0'. * 5) tpacket_rcv is currently filling block '7', since there is no space left, * it will close block-7,loop around and try to fill block '0'. * call-flow: * __packet_lookup_frame_in_block * prb_retire_current_block() * prb_dispatch_next_block() * |->(BLOCK_STATUS == USER) evaluates to true * 5.1) Since block-0 is currently in-use, we just freeze the queue. * 6) Now there are two cases: * 6.1) Link goes idle right after the queue is frozen. * But remember, the last open_block() refreshed the timer. * When this timer expires,it will refresh itself so that we can * re-open block-0 in near future. * 6.2) Link is busy and keeps on receiving packets. This is a simple * case and __packet_lookup_frame_in_block will check if block-0 * is free and can now be re-used. */ static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { pkc->reset_pending_on_curr_blk = 1; po->stats.stats3.tp_freeze_q_cnt++; } #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) /* * If the next block is free then we will dispatch it * and return a good offset. * Else, we will freeze the queue. * So, caller must check the return value. */ static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { struct tpacket_block_desc *pbd; smp_rmb(); /* 1. Get current block num */ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* 2. If this block is currently in_use then freeze the queue */ if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { prb_freeze_queue(pkc, po); return NULL; } /* * 3. * open this block and return the offset where the first packet * needs to get stored. */ prb_open_block(pkc, pbd); return (void *)pkc->nxt_offset; } static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po, unsigned int status) { struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* retire/close the current block */ if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { /* * Plug the case where copy_bits() is in progress on * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't * have space to copy the pkt in the current block and * called prb_retire_current_block() * * We don't need to worry about the TMO case because * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } prb_close_block(pkc, pbd, po, status); return; } } static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd) { return TP_STATUS_USER & BLOCK_STATUS(pbd); } static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) { return pkc->reset_pending_on_curr_blk; } static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) __releases(&pkc->blk_fill_in_prog_lock) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); read_unlock(&pkc->blk_fill_in_prog_lock); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); } static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = 0; } static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { if (skb_vlan_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { ppd->hv1.tp_vlan_tci = 0; ppd->hv1.tp_vlan_tpid = 0; ppd->tp_status = TP_STATUS_AVAILABLE; } } static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_padding = 0; prb_fill_vlan_info(pkc, ppd); if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) prb_fill_rxhash(pkc, ppd); else prb_clear_rxhash(pkc, ppd); } static void prb_fill_curr_block(char *curr, struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd, unsigned int len) __acquires(&pkc->blk_fill_in_prog_lock) { struct tpacket3_hdr *ppd; ppd = (struct tpacket3_hdr *)curr; ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); pkc->prev = curr; pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; read_lock(&pkc->blk_fill_in_prog_lock); prb_run_all_ft_ops(pkc, ppd); } /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, unsigned int len ) { struct tpacket_kbdq_core *pkc; struct tpacket_block_desc *pbd; char *curr, *end; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* Queue is frozen when user space is lagging behind */ if (prb_queue_frozen(pkc)) { /* * Check if that last block which caused the queue to freeze, * is still in_use by user-space. */ if (prb_curr_blk_in_use(pbd)) { /* Can't record this packet */ return NULL; } else { /* * Ok, the block was released by user-space. * Now let's open that block. * opening a block also thaws the queue. * Thawing is a side effect. */ prb_open_block(pkc, pbd); } } smp_mb(); curr = pkc->nxt_offset; pkc->skb = skb; end = (char *)pbd + pkc->kblk_size; /* first try the current block */ if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* Ok, close the current block */ prb_retire_current_block(pkc, po, 0); /* Now, try to dispatch the next block */ curr = (char *)prb_dispatch_next_block(pkc, po); if (curr) { pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* * No free blocks are available.user_space hasn't caught up yet. * Queue was just frozen and now this packet will get dropped. */ return NULL; } static void *packet_current_rx_frame(struct packet_sock *po, struct sk_buff *skb, int status, unsigned int len) { char *curr = NULL; switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: curr = packet_lookup_frame(po, &po->rx_ring, po->rx_ring.head, status); return curr; case TPACKET_V3: return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); return NULL; } } static void *prb_lookup_block(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int idx, int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); if (status != BLOCK_STATUS(pbd)) return NULL; return pbd; } static int prb_previous_blk_num(struct packet_ring_buffer *rb) { unsigned int prev; if (rb->prb_bdqc.kactive_blk_num) prev = rb->prb_bdqc.kactive_blk_num-1; else prev = rb->prb_bdqc.knum_blocks-1; return prev; } /* Assumes caller has held the rx_queue.lock */ static void *__prb_previous_block(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = prb_previous_blk_num(rb); return prb_lookup_block(po, rb, previous, status); } static void *packet_previous_rx_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { if (po->tp_version <= TPACKET_V2) return packet_previous_frame(po, rb, status); return __prb_previous_block(po, rb, status); } static void packet_increment_rx_head(struct packet_sock *po, struct packet_ring_buffer *rb) { switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: return packet_increment_head(rb); case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); return; } } static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; return packet_lookup_frame(po, rb, previous, status); } static void packet_increment_head(struct packet_ring_buffer *buff) { buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } static void packet_inc_pending(struct packet_ring_buffer *rb) { this_cpu_inc(*rb->pending_refcnt); } static void packet_dec_pending(struct packet_ring_buffer *rb) { this_cpu_dec(*rb->pending_refcnt); } static unsigned int packet_read_pending(const struct packet_ring_buffer *rb) { unsigned int refcnt = 0; int cpu; /* We don't use pending refcount in rx_ring. */ if (rb->pending_refcnt == NULL) return 0; for_each_possible_cpu(cpu) refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); return refcnt; } static int packet_alloc_pending(struct packet_sock *po) { po->rx_ring.pending_refcnt = NULL; po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); if (unlikely(po->tx_ring.pending_refcnt == NULL)) return -ENOBUFS; return 0; } static void packet_free_pending(struct packet_sock *po) { free_percpu(po->tx_ring.pending_refcnt); } #define ROOM_POW_OFF 2 #define ROOM_NONE 0x0 #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.frame_max) + 1; idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static int __packet_rcv_has_room(const struct packet_sock *po, const struct sk_buff *skb) { const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { int rcvbuf = READ_ONCE(sk->sk_rcvbuf); int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) - (skb ? skb->truesize : 0); if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; else return ROOM_NONE; } if (po->tp_version == TPACKET_V3) { if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_v3_has_room(po, 0)) ret = ROOM_LOW; } else { if (__tpacket_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_has_room(po, 0)) ret = ROOM_LOW; } return ret; } static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { bool pressure; int ret; ret = __packet_rcv_has_room(po, skb); pressure = ret != ROOM_NORMAL; if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure); return ret; } static void packet_rcv_try_clear_pressure(struct packet_sock *po) { if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false); } static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive packet socket: %p\n", sk); return; } } static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { u32 *history = po->rollover->history; u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) if (READ_ONCE(history[i]) == rxhash) count++; victim = get_random_u32_below(ROLLOVER_HLEN); /* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) WRITE_ONCE(history[victim], rxhash); return count > (ROLLOVER_HLEN >> 1); } static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return reciprocal_scale(__skb_get_hash_symmetric(skb), num); } static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { unsigned int val = atomic_inc_return(&f->rr_cur); return val % num; } static unsigned int fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return smp_processor_id() % num; } static unsigned int fanout_demux_rnd(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return get_random_u32_below(num); } static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, unsigned int idx, bool try_self, unsigned int num) { struct packet_sock *po, *po_next, *po_skip = NULL; unsigned int i, j, room = ROOM_NONE; po = pkt_sk(rcu_dereference(f->arr[idx])); if (try_self) { room = packet_rcv_has_room(po, skb); if (room == ROOM_NORMAL || (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) return idx; po_skip = po; } i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(rcu_dereference(f->arr[i])); if (po_next != po_skip && !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; atomic_long_inc(&po->rollover->num); if (room == ROOM_LOW) atomic_long_inc(&po->rollover->num_huge); return i; } if (++i == num) i = 0; } while (i != j); atomic_long_inc(&po->rollover->num_failed); return idx; } static unsigned int fanout_demux_qm(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return skb_get_queue_mapping(skb) % num; } static unsigned int fanout_demux_bpf(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { struct bpf_prog *prog; unsigned int ret = 0; rcu_read_lock(); prog = rcu_dereference(f->bpf_prog); if (prog) ret = bpf_prog_run_clear_cb(prog, skb) % num; rcu_read_unlock(); return ret; } static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); } static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = READ_ONCE(f->num_members); struct net *net = read_pnet(&f->net); struct packet_sock *po; unsigned int idx; if (!net_eq(dev_net(dev), net) || !num) { kfree_skb(skb); return 0; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } switch (f->type) { case PACKET_FANOUT_HASH: default: idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: idx = fanout_demux_lb(f, skb, num); break; case PACKET_FANOUT_CPU: idx = fanout_demux_cpu(f, skb, num); break; case PACKET_FANOUT_RND: idx = fanout_demux_rnd(f, skb, num); break; case PACKET_FANOUT_QM: idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, false, num); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: idx = fanout_demux_bpf(f, skb, num); break; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) idx = fanout_demux_rollover(f, skb, idx, true, num); po = pkt_sk(rcu_dereference(f->arr[idx])); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } DEFINE_MUTEX(fanout_mutex); EXPORT_SYMBOL_GPL(fanout_mutex); static LIST_HEAD(fanout_list); static u16 fanout_next_id; static void __fanout_link(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; spin_lock(&f->lock); rcu_assign_pointer(f->arr[f->num_members], sk); smp_wmb(); f->num_members++; if (f->num_members == 1) dev_add_pack(&f->prot_hook); spin_unlock(&f->lock); } static void __fanout_unlink(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; int i; spin_lock(&f->lock); for (i = 0; i < f->num_members; i++) { if (rcu_dereference_protected(f->arr[i], lockdep_is_held(&f->lock)) == sk) break; } BUG_ON(i >= f->num_members); rcu_assign_pointer(f->arr[i], rcu_dereference_protected(f->arr[f->num_members - 1], lockdep_is_held(&f->lock))); f->num_members--; if (f->num_members == 0) __dev_remove_pack(&f->prot_hook); spin_unlock(&f->lock); } static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { if (sk->sk_family != PF_PACKET) return false; return ptype->af_packet_priv == pkt_sk(sk)->fanout; } static void fanout_init_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_LB: atomic_set(&f->rr_cur, 0); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: RCU_INIT_POINTER(f->bpf_prog, NULL); break; } } static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) { struct bpf_prog *old; spin_lock(&f->lock); old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); rcu_assign_pointer(f->bpf_prog, new); spin_unlock(&f->lock); if (old) { synchronize_net(); bpf_prog_destroy(old); } } static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; struct sock_fprog fprog; int ret; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; ret = copy_bpf_fprog_from_user(&fprog, data, len); if (ret) return ret; ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) return ret; __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; u32 fd; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; if (len != sizeof(fd)) return -EINVAL; if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: return fanout_set_data_cbpf(po, data, len); case PACKET_FANOUT_EBPF: return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; } } static void fanout_release_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); } } static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) { struct packet_fanout *f; list_for_each_entry(f, &fanout_list, list) { if (f->id == candidate_id && read_pnet(&f->net) == sock_net(sk)) { return false; } } return true; } static bool fanout_find_new_id(struct sock *sk, u16 *new_id) { u16 id = fanout_next_id; do { if (__fanout_id_is_free(sk, id)) { *new_id = id; fanout_next_id = id + 1; return true; } id++; } while (id != fanout_next_id); return false; } static int fanout_add(struct sock *sk, struct fanout_args *args) { struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); u16 type_flags = args->type_flags; struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; u16 id = args->id; int err; switch (type) { case PACKET_FANOUT_ROLLOVER: if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) return -EINVAL; break; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: break; default: return -EINVAL; } mutex_lock(&fanout_mutex); err = -EALREADY; if (po->fanout) goto out; if (type == PACKET_FANOUT_ROLLOVER || (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { err = -ENOMEM; rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); if (!rollover) goto out; atomic_long_set(&rollover->num, 0); atomic_long_set(&rollover->num_huge, 0); atomic_long_set(&rollover->num_failed, 0); } if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { if (id != 0) { err = -EINVAL; goto out; } if (!fanout_find_new_id(sk, &id)) { err = -ENOMEM; goto out; } /* ephemeral flag for the first socket in the group: drop it */ flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8); } match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && read_pnet(&f->net) == sock_net(sk)) { match = f; break; } } err = -EINVAL; if (match) { if (match->flags != flags) goto out; if (args->max_num_members && args->max_num_members != match->max_num_members) goto out; } else { if (args->max_num_members > PACKET_FANOUT_MAX) goto out; if (!args->max_num_members) /* legacy PACKET_FANOUT_MAX */ args->max_num_members = 256; err = -ENOMEM; match = kvzalloc(struct_size(match, arr, args->max_num_members), GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); match->id = id; match->type = type; match->flags = flags; INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); refcount_set(&match->sk_ref, 0); fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING; list_add(&match->list, &fanout_list); } err = -EINVAL; spin_lock(&po->bind_lock); if (packet_sock_flag(po, PACKET_SOCK_RUNNING) && match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { __dev_remove_pack(&po->prot_hook); /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match); po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); __fanout_link(sk, po); err = 0; } } spin_unlock(&po->bind_lock); if (err && !refcount_read(&match->sk_ref)) { list_del(&match->list); kvfree(match); } out: kfree(rollover); mutex_unlock(&fanout_mutex); return err; } /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout. * It is the responsibility of the caller to call fanout_release_data() and * free the returned packet_fanout (after synchronize_net()) */ static struct packet_fanout *fanout_release(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; mutex_lock(&fanout_mutex); f = po->fanout; if (f) { po->fanout = NULL; if (refcount_dec_and_test(&f->sk_ref)) list_del(&f->list); else f = NULL; } mutex_unlock(&fanout_mutex); return f; } static bool packet_extra_vlan_len_allowed(const struct net_device *dev, struct sk_buff *skb) { /* Earlier code assumed this would be a VLAN pkt, double-check * this now that we have the actual packet in hand. We can only * do this check on Ethernet devices. */ if (unlikely(dev->type != ARPHRD_ETHER)) return false; skb_reset_mac_header(skb); return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q)); } static const struct proto_ops packet_ops; static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; /* * When we registered the protocol we saved the socket in the data * field for just this event. */ sk = pt->af_packet_priv; /* * Yank back the headers [hope the device set this * right or kerboom...] * * Incoming packets have ll header pulled, * push it back. * * For outgoing ones skb->data == skb_mac_header(skb) * so that this procedure is noop. */ if (skb->pkt_type == PACKET_LOOPBACK) goto out; if (!net_eq(dev_net(dev), sock_net(sk))) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) goto oom; /* drop any routing info */ skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; skb_push(skb, skb->data - skb_mac_header(skb)); /* * The SOCK_PACKET socket receives _all_ frames. */ spkt->spkt_family = dev->type; strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); spkt->spkt_protocol = skb->protocol; /* * Charge the memory to the socket. This is done specifically * to prevent sockets using all the memory up. */ if (sock_queue_rcv_skb(sk, skb) == 0) return 0; out: kfree_skb(skb); oom: return 0; } static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) { int depth; if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) && sock->type == SOCK_RAW) { skb_reset_mac_header(skb); skb->protocol = dev_parse_header_protocol(skb); } /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); skb_probe_transport_header(skb); } /* * Output a raw packet to a device layer. This bypasses all the other * protocol layers and you must therefore supply it with a complete frame */ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); struct sk_buff *skb = NULL; struct net_device *dev; struct sockcm_cookie sockc; __be16 proto = 0; int err; int extra_len = 0; /* * Get and verify the address. */ if (saddr) { if (msg->msg_namelen < sizeof(struct sockaddr)) return -EINVAL; if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) proto = saddr->spkt_protocol; } else return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ /* * Find the device first to size check it */ saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; retry: rcu_read_lock(); dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); err = -ENODEV; if (dev == NULL) goto out_unlock; err = -ENETDOWN; if (!(dev->flags & IFF_UP)) goto out_unlock; /* * You may not queue a frame bigger than the mtu. This is the lowest level * raw protocol and you must do your own fragmentation at this level. */ if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) goto out_unlock; if (!skb) { size_t reserved = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; rcu_read_unlock(); skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* FIXME: Save some space for broken drivers that write a hard * header at transmission time by themselves. PPP is the notable * one here. This should really be fixed at the driver level. */ skb_reserve(skb, reserved); skb_reset_network_header(skb); /* Try to align data part correctly */ if (hhlen) { skb->data -= hhlen; skb->tail -= hhlen; if (len < hhlen) skb_reset_network_header(skb); } err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) goto out_free; goto retry; } if (!dev_validate_header(dev, skb->data, len) || !skb->len) { err = -EINVAL; goto out_unlock; } if (len > (dev->mtu + dev->hard_header_len + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_unlock; } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, sockc.tsflags); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); dev_queue_xmit(skb); rcu_read_unlock(); return len; out_unlock: rcu_read_unlock(); out_free: kfree_skb(skb); return err; } static unsigned int run_filter(struct sk_buff *skb, const struct sock *sk, unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res; } static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, size_t *len, int vnet_hdr_sz) { struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 }; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); } /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. * * Note tricky part: we DO mangle shared skb! skb->data, skb->len * and skb->cb are mangled. It works because (and until) packets * falling here are owned by current CPU. Output packets are cloned * by dev_queue_xmit_nit(), input packets are processed by net_bh * sequentially, so that if we return skb to original state on exit, * we will not harm anyone. */ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; bool is_drop_n_account = false; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; skb->dev = dev; if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. * * Otherwise, the device hides details of its frame * structure, so that corresponding packet head is * never delivered to user. */ if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb->len; res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; if (snaplen > res) snaplen = res; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto drop_n_acct; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb == NULL) goto drop_n_acct; if (skb_head != skb->data) { skb->data = skb_head; skb->len = skb_len; } consume_skb(skb); skb = nskb; } sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8); sll = &PACKET_SKB_CB(skb)->sa.ll; sll->sll_hatype = dev->type; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; sll->sll_halen = dev_parse_header(skb, sll->sll_addr); /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). * Use their space for storing the original skb length. */ PACKET_SKB_CB(skb)->sa.origlen = skb->len; if (pskb_trim(skb, snaplen)) goto drop_n_acct; skb_set_owner_r(skb, sk); skb->dev = NULL; skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; sock_skb_set_dropcount(sk, skb); skb_clear_delivery_time(skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); return 0; drop_n_acct: is_drop_n_account = true; atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: if (!is_drop_n_account) consume_skb(skb); else kfree_skb(skb); return 0; } static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct packet_sock *po; struct sockaddr_ll *sll; union tpacket_uhdr h; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; unsigned long status = TP_STATUS_USER; unsigned short macoff, hdrlen; unsigned int netoff; struct sk_buff *copy_skb = NULL; struct timespec64 ts; __u32 ts_status; bool is_drop_n_account = false; unsigned int slot_id = 0; int vnet_hdr_sz = 0; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing * userspace to call getsockopt(..., PACKET_HDRLEN, ...). */ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb->len; res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; /* If we are flooded, just give up */ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) status |= TP_STATUS_GSO_TCP; if (snaplen > res) snaplen = res; if (sk->sk_type == SOCK_DGRAM) { macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + po->tp_reserve; } else { unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); if (vnet_hdr_sz) netoff += vnet_hdr_sz; macoff = netoff - maclen; } if (netoff > USHRT_MAX) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (po->tp_version <= TPACKET_V2) { if (macoff + snaplen > po->rx_ring.frame_size) { if (po->copy_thresh && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { if (skb_shared(skb)) { copy_skb = skb_clone(skb, GFP_ATOMIC); } else { copy_skb = skb_get(skb); skb_head = skb->data; } if (copy_skb) { memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0, sizeof(PACKET_SKB_CB(copy_skb)->sa.ll)); skb_set_owner_r(copy_skb, sk); } } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { snaplen = 0; vnet_hdr_sz = 0; } } } else if (unlikely(macoff + snaplen > GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { u32 nval; nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n", snaplen, nval, macoff); snaplen = nval; if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; vnet_hdr_sz = 0; } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) goto drop_n_account; if (po->tp_version <= TPACKET_V2) { slot_id = po->rx_ring.head; if (test_bit(slot_id, po->rx_ring.rx_owner_map)) goto drop_n_account; __set_bit(slot_id, po->rx_ring.rx_owner_map); } if (vnet_hdr_sz && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), vio_le(), true, 0)) { if (po->tp_version == TPACKET_V3) prb_clear_blk_fill_status(&po->rx_ring); goto drop_n_account; } if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* * LOSING will be reported till you read the stats, * because it's COR - Clear On Read. * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; skb_clear_delivery_time(copy_skb); __skb_queue_tail(&sk->sk_receive_queue, copy_skb); } spin_unlock(&sk->sk_receive_queue.lock); skb_copy_bits(skb, 0, h.raw + macoff, snaplen); /* Always timestamp; prefer an existing software timestamp taken * closer to the time of capture. */ ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp) | SOF_TIMESTAMPING_SOFTWARE); if (!ts_status) ktime_get_real_ts64(&ts); status |= ts_status; switch (po->tp_version) { case TPACKET_V1: h.h1->tp_len = skb->len; h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; hdrlen = sizeof(*h.h1); break; case TPACKET_V2: h.h2->tp_len = skb->len; h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; if (skb_vlan_tag_present(skb)) { h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { h.h2->tp_vlan_tci = 0; h.h2->tp_vlan_tpid = 0; } memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); hdrlen = sizeof(*h.h2); break; case TPACKET_V3: /* tp_nxt_offset,vlan are already populated above. * So DONT clear those fields here */ h.h3->tp_status |= status; h.h3->tp_len = skb->len; h.h3->tp_snaplen = snaplen; h.h3->tp_mac = macoff; h.h3->tp_net = netoff; h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); hdrlen = sizeof(*h.h3); break; default: BUG(); } sll = h.raw + TPACKET_ALIGN(hdrlen); sll->sll_halen = dev_parse_header(skb, sll->sll_addr); sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; sll->sll_protocol = skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; smp_mb(); #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 if (po->tp_version <= TPACKET_V2) { u8 *start, *end; end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + macoff + snaplen); for (start = h.raw; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); } smp_wmb(); #endif if (po->tp_version <= TPACKET_V2) { spin_lock(&sk->sk_receive_queue.lock); __packet_set_status(po, h.raw, status); __clear_bit(slot_id, po->rx_ring.rx_owner_map); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); } else if (po->tp_version == TPACKET_V3) { prb_clear_blk_fill_status(&po->rx_ring); } drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: if (!is_drop_n_account) consume_skb(skb); else kfree_skb(skb); return 0; drop_n_account: spin_unlock(&sk->sk_receive_queue.lock); atomic_inc(&po->tp_drops); is_drop_n_account = true; sk->sk_data_ready(sk); kfree_skb(copy_skb); goto drop_n_restore; } static void tpacket_destruct_skb(struct sk_buff *skb) { struct packet_sock *po = pkt_sk(skb->sk); if (likely(po->tx_ring.pg_vec)) { void *ph; __u32 ts; ph = skb_zcopy_get_nouarg(skb); packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); if (!packet_read_pending(&po->tx_ring)) complete(&po->skb_completion); } sock_wfree(skb); } static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) { if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) return -EINVAL; return 0; } static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz) { int ret; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; ret = __packet_snd_vnet_parse(vnet_hdr, *len); if (ret) return ret; /* move iter to point to the start of mac header */ if (vnet_hdr_sz != sizeof(struct virtio_net_hdr)) iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr)); return 0; } static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, void *data, int tp_len, __be16 proto, unsigned char *addr, int hlen, int copylen, const struct sockcm_cookie *sockc) { union tpacket_uhdr ph; int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; int err; ph.raw = frame; skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(po->sk.sk_priority); skb->mark = READ_ONCE(po->sk.sk_mark); skb->tstamp = sockc->transmit_time; skb_setup_tx_timestamp(skb, sockc->tsflags); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); skb_reset_network_header(skb); to_write = tp_len; if (sock->type == SOCK_DGRAM) { err = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; } else if (copylen) { int hdrlen = min_t(int, copylen, tp_len); skb_push(skb, dev->hard_header_len); skb_put(skb, copylen - dev->hard_header_len); err = skb_store_bits(skb, 0, data, hdrlen); if (unlikely(err)) return err; if (!dev_validate_header(dev, skb->data, hdrlen)) return -EINVAL; data += hdrlen; to_write -= hdrlen; } offset = offset_in_page(data); len_max = PAGE_SIZE - offset; len = ((to_write > len_max) ? len_max : to_write); skb->data_len = to_write; skb->len += to_write; skb->truesize += to_write; refcount_add(to_write, &po->sk.sk_wmem_alloc); while (likely(to_write)) { nr_frags = skb_shinfo(skb)->nr_frags; if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { pr_err("Packet exceed the number of skb frags(%u)\n", (unsigned int)MAX_SKB_FRAGS); return -EFAULT; } page = pgv_to_page(data); data += len; flush_dcache_page(page); get_page(page); skb_fill_page_desc(skb, nr_frags, page, offset, len); to_write -= len; offset = 0; len_max = PAGE_SIZE; len = ((to_write > len_max) ? len_max : to_write); } packet_parse_headers(skb, sock); return tp_len; } static int tpacket_parse_header(struct packet_sock *po, void *frame, int size_max, void **data) { union tpacket_uhdr ph; int tp_len, off; ph.raw = frame; switch (po->tp_version) { case TPACKET_V3: if (ph.h3->tp_next_offset != 0) { pr_warn_once("variable sized slot not supported"); return -EINVAL; } tp_len = ph.h3->tp_len; break; case TPACKET_V2: tp_len = ph.h2->tp_len; break; default: tp_len = ph.h1->tp_len; break; } if (unlikely(tp_len > size_max)) { pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); return -EMSGSIZE; } if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) { int off_min, off_max; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); off_max = po->tx_ring.frame_size - tp_len; if (po->sk.sk_type == SOCK_DGRAM) { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_net; break; case TPACKET_V2: off = ph.h2->tp_net; break; default: off = ph.h1->tp_net; break; } } else { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_mac; break; case TPACKET_V2: off = ph.h2->tp_mac; break; default: off = ph.h1->tp_mac; break; } } if (unlikely((off < off_min) || (off_max < off))) return -EINVAL; } else { off = po->tp_hdrlen - sizeof(struct sockaddr_ll); } *data = frame + off; return tp_len; } static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb = NULL; struct net_device *dev; struct virtio_net_hdr *vnet_hdr = NULL; struct sockcm_cookie sockc; __be16 proto; int err, reserve = 0; void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); unsigned char *addr = NULL; int tp_len, size_max; void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen, copylen = 0; long timeo = 0; mutex_lock(&po->pg_vec_lock); /* packet_sendmsg() check on tx_ring.pg_vec was lockless, * we need to confirm it under protection of pg_vec_lock. */ if (unlikely(!po->tx_ring.pg_vec)) { err = -EBUSY; goto out; } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); if (po->sk.sk_socket->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_put; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_put; sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) goto out_put; } if (po->sk.sk_socket->type == SOCK_RAW) reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; reinit_completion(&po->skb_completion); do { ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { if (need_wait && skb) { timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); if (timeo <= 0) { err = !timeo ? -ETIMEDOUT : -ERESTARTSYS; goto out_put; } } /* check for additional frames */ continue; } skb = NULL; tp_len = tpacket_parse_header(po, ph, size_max, &data); if (tp_len < 0) goto tpacket_error; status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; if (vnet_hdr_sz) { vnet_hdr = data; data += vnet_hdr_sz; tp_len -= vnet_hdr_sz; if (tp_len < 0 || __packet_snd_vnet_parse(vnet_hdr, tp_len)) { tp_len = -EINVAL; goto tpacket_error; } copylen = __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len); } copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll) + (copylen - dev->hard_header_len), !need_wait, &err); if (unlikely(skb == NULL)) { /* we assume the socket was initially writeable ... */ if (likely(len_sum > 0)) err = len_sum; goto out_status; } tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && !vnet_hdr_sz && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; if (unlikely(tp_len < 0)) { tpacket_error: if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); packet_increment_head(&po->tx_ring); kfree_skb(skb); continue; } else { status = TP_STATUS_WRONG_FORMAT; err = tp_len; goto out_status; } } if (vnet_hdr_sz) { if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; } virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ skb = NULL; goto out_status; } /* * skb was dropped but not destructed yet; * let's treat it like congestion or err < 0 */ err = 0; } packet_increment_head(&po->tx_ring); len_sum += tp_len; } while (likely((ph != NULL) || /* Note: packet_read_pending() might be slow if we have * to call it as it's per_cpu variable, but in fast-path * we already short-circuit the loop with the first * condition, and luckily don't have to go that path * anyway. */ (need_wait && packet_read_pending(&po->tx_ring)))); err = len_sum; goto out_put; out_status: __packet_set_status(po, ph, status); kfree_skb(skb); out_put: dev_put(dev); out: mutex_unlock(&po->pg_vec_lock); return err; } static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, size_t reserve, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, reserve); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); struct sk_buff *skb; struct net_device *dev; __be16 proto; unsigned char *addr = NULL; int err, reserve = 0; struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); int hlen, tlen, linear; int extra_len = 0; /* * Get and verify the address. */ if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); if (sock->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_unlock; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out_unlock; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; sockcm_init(&sockc, sk); sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (vnet_hdr_sz) { err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz); if (err) goto out_unlock; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len); linear = max(linear, min_t(int, len, dev->hard_header_len)); skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; skb_reset_network_header(skb); err = -EINVAL; if (sock->type == SOCK_DGRAM) { offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (unlikely(offset < 0)) goto out_free; } else if (reserve) { skb_reserve(skb, -reserve); if (len < reserve + sizeof(struct ipv6hdr) && dev->min_header_len != dev->hard_header_len) skb_reset_network_header(skb); } /* Returns -EFAULT on error */ err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len); if (err) goto out_free; if ((sock->type == SOCK_RAW && !dev_validate_header(dev, skb->data, len)) || !skb->len) { err = -EINVAL; goto out_free; } skb_setup_tx_timestamp(skb, sockc.tsflags); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_free; } skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); if (vnet_hdr_sz) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; len += vnet_hdr_sz; virtio_net_hdr_set_proto(skb, &vnet_hdr); } err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err) goto out_unlock; } dev_put(dev); return len; out_free: kfree_skb(skb); out_unlock: dev_put(dev); out: return err; } static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy. * tpacket_snd() will redo the check safely. */ if (data_race(po->tx_ring.pg_vec)) return tpacket_snd(po, msg); return packet_snd(sock, msg, len); } /* * Close a PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list. */ static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; struct packet_sock *po; struct packet_fanout *f; struct net *net; union tpacket_req_u req_u; if (!sk) return 0; net = sock_net(sk); po = pkt_sk(sk); mutex_lock(&net->packet.sklist_lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(&po->bind_lock); unregister_prot_hook(sk, false); packet_cached_dev_reset(po); if (po->prot_hook.dev) { netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); packet_flush_mclist(sk); lock_sock(sk); if (po->rx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 0); } if (po->tx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 1); } release_sock(sk); f = fanout_release(sk); synchronize_net(); kfree(po->rollover); if (f) { fanout_release_data(f); kvfree(f); } /* * Now the socket is dead. No more input will appear. */ sock_orphan(sk); sock->sk = NULL; /* Purge queues */ skb_queue_purge(&sk->sk_receive_queue); packet_free_pending(po); sock_put(sk); return 0; } /* * Attach a packet hook. */ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, __be16 proto) { struct packet_sock *po = pkt_sk(sk); struct net_device *dev = NULL; bool unlisted = false; bool need_rehook; int ret = 0; lock_sock(sk); spin_lock(&po->bind_lock); if (!proto) proto = po->num; rcu_read_lock(); if (po->fanout) { ret = -EINVAL; goto out_unlock; } if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { ret = -ENODEV; goto out_unlock; } } else if (ifindex) { dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { ret = -ENODEV; goto out_unlock; } } need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev; if (need_rehook) { dev_hold(dev); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { rcu_read_unlock(); /* prevents packet_notifier() from calling * register_prot_hook() */ WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, true); rcu_read_lock(); if (dev) unlisted = !dev_get_by_index_rcu(sock_net(sk), dev->ifindex); } BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING)); WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); if (unlikely(unlisted)) { po->prot_hook.dev = NULL; WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { netdev_hold(dev, &po->prot_hook.dev_tracker, GFP_ATOMIC); po->prot_hook.dev = dev; WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0); packet_cached_dev_assign(po, dev); } dev_put(dev); } if (proto == 0 || !need_rehook) goto out_unlock; if (!unlisted && (!dev || (dev->flags & IFF_UP))) { register_prot_hook(sk); } else { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } out_unlock: rcu_read_unlock(); spin_unlock(&po->bind_lock); release_sock(sk); return ret; } /* * Bind a packet socket to a device */ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; char name[sizeof(uaddr->sa_data_min) + 1]; /* * Check legality */ if (addr_len != sizeof(struct sockaddr)) return -EINVAL; /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); name[sizeof(uaddr->sa_data_min)] = 0; return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; /* * Check legality */ if (addr_len < sizeof(struct sockaddr_ll)) return -EINVAL; if (sll->sll_family != AF_PACKET) return -EINVAL; return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { .name = "PACKET", .owner = THIS_MODULE, .obj_size = sizeof(struct packet_sock), }; /* * Create a packet of type SOCK_PACKET. */ static int packet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct packet_sock *po; __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT; sock->state = SS_UNCONNECTED; err = -ENOBUFS; sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; sock_init_data(sock, sk); po = pkt_sk(sk); init_completion(&po->skb_completion); sk->sk_family = PF_PACKET; po->num = proto; err = packet_alloc_pending(po); if (err) goto out2; packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; /* * Attach a protocol block */ spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; __register_prot_hook(sk); } mutex_lock(&net->packet.sklist_lock); sk_add_node_tail_rcu(sk, &net->packet.sklist); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, &packet_proto, 1); return 0; out2: sk_free(sk); out: return err; } /* * Pull a packet from our receive queue and hand it to the user. * If necessary we block. */ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); unsigned int origlen = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) goto out; #if 0 /* What error should we return now? EUNATTACH? */ if (pkt_sk(sk)->ifindex < 0) return -ENODEV; #endif if (flags & MSG_ERRQUEUE) { err = sock_recv_errqueue(sk, msg, len, SOL_PACKET, PACKET_TX_TIMESTAMP); goto out; } /* * Call the generic datagram receiver. This handles all sorts * of horrible races and re-entrancy so we can forget about it * in the protocol layers. * * Now it will return ENETDOWN, if device have just gone down, * but then it will block. */ skb = skb_recv_datagram(sk, flags, &err); /* * An error occurred so return it. Because skb_recv_datagram() * handles the blocking we don't see and worry about blocking * retries. */ if (skb == NULL) goto out; packet_rcv_try_clear_pressure(pkt_sk(sk)); if (vnet_hdr_len) { err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); if (err) goto out_free; } /* You lose any data beyond the buffer you gave. If it worries * a user program they can ask the device for its MTU * anyway. */ copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; if (sock->type != SOCK_PACKET) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; /* Original length was stored in sockaddr_ll fields */ origlen = PACKET_SKB_CB(skb)->sa.origlen; sll->sll_family = AF_PACKET; sll->sll_protocol = skb->protocol; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { const size_t max_len = min(sizeof(skb->cb), sizeof(struct sockaddr_storage)); int copy_len; /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); copy_len = msg->msg_namelen; } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); copy_len = msg->msg_namelen; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { memset(msg->msg_name + offsetof(struct sockaddr_ll, sll_addr), 0, sizeof(sll->sll_addr)); msg->msg_namelen = sizeof(struct sockaddr_ll); } } if (WARN_ON_ONCE(copy_len > max_len)) { copy_len = max_len; msg->msg_namelen = copy_len; } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) aux.tp_status |= TP_STATUS_GSO_TCP; aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); if (skb_vlan_tag_present(skb)) { aux.tp_vlan_tci = skb_vlan_tag_get(skb); aux.tp_vlan_tpid = ntohs(skb->vlan_proto); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } /* * Free or return the buffer as appropriate. Again this * hides all the races and re-entrancy issues from us. */ err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); out_free: skb_free_datagram(sk, skb); out: return err; } static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; if (peer) return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); rcu_read_unlock(); return sizeof(*uaddr); } static int packet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); int ifindex; if (peer) return -EOPNOTSUPP; ifindex = READ_ONCE(po->ifindex); sll->sll_family = AF_PACKET; sll->sll_ifindex = ifindex; sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; /* Let __fortify_memcpy_chk() know the actual buffer size. */ memcpy(((struct sockaddr_storage *)sll)->__data + offsetof(struct sockaddr_ll, sll_addr) - offsetofend(struct sockaddr_ll, sll_family), dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; } rcu_read_unlock(); return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; } static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) { switch (i->type) { case PACKET_MR_MULTICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_mc_add(dev, i->addr); else return dev_mc_del(dev, i->addr); break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what); case PACKET_MR_ALLMULTI: return dev_set_allmulti(dev, what); case PACKET_MR_UNICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_uc_add(dev, i->addr); else return dev_uc_del(dev, i->addr); break; default: break; } return 0; } static void packet_dev_mclist_delete(struct net_device *dev, struct packet_mclist **mlp) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { packet_dev_mc(dev, ml, -1); *mlp = ml->next; kfree(ml); } else mlp = &ml->next; } } static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml, *i; struct net_device *dev; int err; rtnl_lock(); err = -ENODEV; dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); if (!dev) goto done; err = -EINVAL; if (mreq->mr_alen > dev->addr_len) goto done; err = -ENOBUFS; i = kmalloc(sizeof(*i), GFP_KERNEL); if (i == NULL) goto done; err = 0; for (ml = po->mclist; ml; ml = ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { ml->count++; /* Free the new element ... */ kfree(i); goto done; } } i->type = mreq->mr_type; i->ifindex = mreq->mr_ifindex; i->alen = mreq->mr_alen; memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); if (err) { po->mclist = i->next; kfree(i); } done: rtnl_unlock(); return err; } static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_mclist *ml, **mlp; rtnl_lock(); for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { if (--ml->count == 0) { struct net_device *dev; *mlp = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev) packet_dev_mc(dev, ml, -1); kfree(ml); } break; } } rtnl_unlock(); return 0; } static void packet_flush_mclist(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml; if (!po->mclist) return; rtnl_lock(); while ((ml = po->mclist) != NULL) { struct net_device *dev; po->mclist = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev != NULL) packet_dev_mc(dev, ml, -1); kfree(ml); } rtnl_unlock(); } static int packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); int ret; if (level != SOL_PACKET) return -ENOPROTOOPT; switch (optname) { case PACKET_ADD_MEMBERSHIP: case PACKET_DROP_MEMBERSHIP: { struct packet_mreq_max mreq; int len = optlen; memset(&mreq, 0, sizeof(mreq)); if (len < sizeof(struct packet_mreq)) return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); if (copy_from_sockptr(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; if (optname == PACKET_ADD_MEMBERSHIP) ret = packet_mc_add(sk, &mreq); else ret = packet_mc_drop(sk, &mreq); return ret; } case PACKET_RX_RING: case PACKET_TX_RING: { union tpacket_req_u req_u; int len; lock_sock(sk); switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: len = sizeof(req_u.req); break; case TPACKET_V3: default: len = sizeof(req_u.req3); break; } if (optlen < len) { ret = -EINVAL; } else { if (copy_from_sockptr(&req_u.req, optval, len)) ret = -EFAULT; else ret = packet_set_ring(sk, &req_u, 0, optname == PACKET_TX_RING); } release_sock(sk); return ret; } case PACKET_COPY_THRESH: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; pkt_sk(sk)->copy_thresh = val; return 0; } case PACKET_VERSION: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: case TPACKET_V2: case TPACKET_V3: break; default: return -EINVAL; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_version = val; ret = 0; } release_sock(sk); return ret; } case PACKET_RESERVE: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_reserve = val; ret = 0; } release_sock(sk); return ret; } case PACKET_LOSS: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val); ret = 0; } release_sock(sk); return ret; } case PACKET_AUXDATA: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val); return 0; } case PACKET_ORIGDEV: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val); return 0; } case PACKET_VNET_HDR: case PACKET_VNET_HDR_SZ: { int val, hdr_len; if (sock->type != SOCK_RAW) return -EINVAL; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (optname == PACKET_VNET_HDR_SZ) { if (val && val != sizeof(struct virtio_net_hdr) && val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) return -EINVAL; hdr_len = val; } else { hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { WRITE_ONCE(po->vnet_hdr_sz, hdr_len); ret = 0; } release_sock(sk); return ret; } case PACKET_TIMESTAMP: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(po->tp_tstamp, val); return 0; } case PACKET_FANOUT: { struct fanout_args args = { 0 }; if (optlen != sizeof(int) && optlen != sizeof(args)) return -EINVAL; if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT; return fanout_add(sk, &args); } case PACKET_FANOUT_DATA: { /* Paired with the WRITE_ONCE() in fanout_add() */ if (!READ_ONCE(po->fanout)) return -EINVAL; return fanout_set_data(po, optval, optlen); } case PACKET_IGNORE_OUTGOING: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val < 0 || val > 1) return -EINVAL; po->prot_hook.ignore_outgoing = !!val; return 0; } case PACKET_TX_HAS_OFF: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec) packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val); release_sock(sk); return 0; } case PACKET_QDISC_BYPASS: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val); return 0; } default: return -ENOPROTOOPT; } } static int packet_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { int len; int val, lv = sizeof(val); struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case PACKET_STATISTICS: spin_lock_bh(&sk->sk_receive_queue.lock); memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); st.stats3.tp_drops = drops; st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); st.stats1.tp_drops = drops; st.stats1.tp_packets += drops; data = &st.stats1; } break; case PACKET_AUXDATA: val = packet_sock_flag(po, PACKET_SOCK_AUXDATA); break; case PACKET_ORIGDEV: val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: val = !!READ_ONCE(po->vnet_hdr_sz); break; case PACKET_VNET_HDR_SZ: val = READ_ONCE(po->vnet_hdr_sz); break; case PACKET_VERSION: val = po->tp_version; break; case PACKET_HDRLEN: if (len > sizeof(int)) len = sizeof(int); if (len < sizeof(int)) return -EINVAL; if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { case TPACKET_V1: val = sizeof(struct tpacket_hdr); break; case TPACKET_V2: val = sizeof(struct tpacket2_hdr); break; case TPACKET_V3: val = sizeof(struct tpacket3_hdr); break; default: return -EINVAL; } break; case PACKET_RESERVE: val = po->tp_reserve; break; case PACKET_LOSS: val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS); break; case PACKET_TIMESTAMP: val = READ_ONCE(po->tp_tstamp); break; case PACKET_FANOUT: val = (po->fanout ? ((u32)po->fanout->id | ((u32)po->fanout->type << 16) | ((u32)po->fanout->flags << 24)) : 0); break; case PACKET_IGNORE_OUTGOING: val = po->prot_hook.ignore_outgoing; break; case PACKET_ROLLOVER_STATS: if (!po->rollover) return -EINVAL; rstats.tp_all = atomic_long_read(&po->rollover->num); rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); data = &rstats; lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF: val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF); break; case PACKET_QDISC_BYPASS: val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS); break; default: return -ENOPROTOOPT; } if (len > lv) len = lv; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, data, len)) return -EFAULT; return 0; } static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct sock *sk; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { struct packet_sock *po = pkt_sk(sk); switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) packet_dev_mclist_delete(dev, &po->mclist); fallthrough; case NETDEV_DOWN: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __unregister_prot_hook(sk, false); sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); WRITE_ONCE(po->ifindex, -1); netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); } break; case NETDEV_UP: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (po->num) register_prot_hook(sk); spin_unlock(&po->bind_lock); } break; } } rcu_read_unlock(); return NOTIFY_DONE; } static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCOUTQ: { int amount = sk_wmem_alloc_get(sk); return put_user(amount, (int __user *)arg); } case SIOCINQ: { struct sk_buff *skb; int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: case SIOCDARP: case SIOCGARP: case SIOCSARP: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCSIFFLAGS: return inet_dgram_ops.ioctl(sock, cmd, arg); #endif default: return -ENOIOCTLCMD; } return 0; } static __poll_t packet_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { if (!packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) mask |= EPOLLOUT | EPOLLWRNORM; } spin_unlock_bh(&sk->sk_write_queue.lock); return mask; } /* Dirty? Well, I still did not learn better way to account * for user mmaps. */ static void packet_mm_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_inc(&pkt_sk(sk)->mapped); } static void packet_mm_close(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_dec(&pkt_sk(sk)->mapped); } static const struct vm_operations_struct packet_mmap_ops = { .open = packet_mm_open, .close = packet_mm_close, }; static void free_pg_vec(struct pgv *pg_vec, unsigned int order, unsigned int len) { int i; for (i = 0; i < len; i++) { if (likely(pg_vec[i].buffer)) { if (is_vmalloc_addr(pg_vec[i].buffer)) vfree(pg_vec[i].buffer); else free_pages((unsigned long)pg_vec[i].buffer, order); pg_vec[i].buffer = NULL; } } kfree(pg_vec); } static char *alloc_one_pg_vec_page(unsigned long order) { char *buffer; gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* __get_free_pages failed, fall back to vmalloc */ buffer = vzalloc(array_size((1 << order), PAGE_SIZE)); if (buffer) return buffer; /* vmalloc failed, lets dig into swap here */ gfp_flags &= ~__GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* complete and utter failure */ return NULL; } static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) { unsigned int block_nr = req->tp_block_nr; struct pgv *pg_vec; int i; pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!pg_vec)) goto out; for (i = 0; i < block_nr; i++) { pg_vec[i].buffer = alloc_one_pg_vec_page(order); if (unlikely(!pg_vec[i].buffer)) goto out_free_pgvec; } out: return pg_vec; out_free_pgvec: free_pg_vec(pg_vec, order, block_nr); pg_vec = NULL; goto out; } static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring) { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); unsigned long *rx_owner_map = NULL; int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; __be16 num; int err; /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; rb = tx_ring ? &po->tx_ring : &po->rx_ring; rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; err = -EBUSY; if (!closing) { if (atomic_read(&po->mapped)) goto out; if (packet_read_pending(rb)) goto out; } if (req->tp_block_nr) { unsigned int min_frame_size; /* Sanity tests and some calculations */ err = -EBUSY; if (unlikely(rb->pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V1: po->tp_hdrlen = TPACKET_HDRLEN; break; case TPACKET_V2: po->tp_hdrlen = TPACKET2_HDRLEN; break; case TPACKET_V3: po->tp_hdrlen = TPACKET3_HDRLEN; break; } err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) goto out; if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; min_frame_size = po->tp_hdrlen + po->tp_reserve; if (po->tp_version >= TPACKET_V3 && req->tp_block_size < BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size) goto out; if (unlikely(req->tp_frame_size < min_frame_size)) goto out; if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; rb->frames_per_block = req->tp_block_size / req->tp_frame_size; if (unlikely(rb->frames_per_block == 0)) goto out; if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) goto out; err = -ENOMEM; order = get_order(req->tp_block_size); pg_vec = alloc_pg_vec(req, order); if (unlikely(!pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V3: /* Block transmit is not supported yet */ if (!tx_ring) { init_prb_bdqc(po, rb, pg_vec, req_u); } else { struct tpacket_req3 *req3 = &req_u->req3; if (req3->tp_retire_blk_tov || req3->tp_sizeof_priv || req3->tp_feature_req_word) { err = -EINVAL; goto out_free_pg_vec; } } break; default: if (!tx_ring) { rx_owner_map = bitmap_alloc(req->tp_frame_nr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); if (!rx_owner_map) goto out_free_pg_vec; } break; } } /* Done */ else { err = -EINVAL; if (unlikely(req->tp_frame_nr)) goto out; } /* Detach socket from network */ spin_lock(&po->bind_lock); was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; if (was_running) { WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, false); } spin_unlock(&po->bind_lock); synchronize_net(); err = -EBUSY; mutex_lock(&po->pg_vec_lock); if (closing || atomic_read(&po->mapped) == 0) { err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); if (po->tp_version <= TPACKET_V2) swap(rb->rx_owner_map, rx_owner_map); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; spin_unlock_bh(&rb_queue->lock); swap(rb->pg_vec_order, order); swap(rb->pg_vec_len, req->tp_block_nr); rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; po->prot_hook.func = (po->rx_ring.pg_vec) ? tpacket_rcv : packet_rcv; skb_queue_purge(rb_queue); if (atomic_read(&po->mapped)) pr_err("packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped)); } mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); if (was_running) { WRITE_ONCE(po->num, num); register_prot_hook(sk); } spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); } out_free_pg_vec: if (pg_vec) { bitmap_free(rx_owner_map); free_pg_vec(pg_vec, order, req->tp_block_nr); } out: return err; } static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); unsigned long size, expected_size; struct packet_ring_buffer *rb; unsigned long start; int err = -EINVAL; int i; if (vma->vm_pgoff) return -EINVAL; mutex_lock(&po->pg_vec_lock); expected_size = 0; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec) { expected_size += rb->pg_vec_len * rb->pg_vec_pages * PAGE_SIZE; } } if (expected_size == 0) goto out; size = vma->vm_end - vma->vm_start; if (size != expected_size) goto out; start = vma->vm_start; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec == NULL) continue; for (i = 0; i < rb->pg_vec_len; i++) { struct page *page; void *kaddr = rb->pg_vec[i].buffer; int pg_num; for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { page = pgv_to_page(kaddr); err = vm_insert_page(vma, start, page); if (unlikely(err)) goto out; start += PAGE_SIZE; kaddr += PAGE_SIZE; } } } atomic_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; out: mutex_unlock(&po->pg_vec_lock); return err; } static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind_spkt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, .poll = datagram_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, }; static const struct net_proto_family packet_family_ops = { .family = PF_PACKET, .create = packet_create, .owner = THIS_MODULE, }; static struct notifier_block packet_netdev_notifier = { .notifier_call = packet_notifier, }; #ifdef CONFIG_PROC_FS static void *packet_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); rcu_read_lock(); return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); } static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); return seq_hlist_next_rcu(v, &net->packet.sklist, pos); } static void packet_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%*sRefCnt Type Proto Iface R Rmem User Inode\n", IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); seq_printf(seq, "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), s->sk_type, ntohs(READ_ONCE(po->num)), READ_ONCE(po->ifindex), packet_sock_flag(po, PACKET_SOCK_RUNNING), atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), sock_i_ino(s)); } return 0; } static const struct seq_operations packet_seq_ops = { .start = packet_seq_start, .next = packet_seq_next, .stop = packet_seq_stop, .show = packet_seq_show, }; #endif static int __net_init packet_net_init(struct net *net) { mutex_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); #ifdef CONFIG_PROC_FS if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } static void __net_exit packet_net_exit(struct net *net) { remove_proc_entry("packet", net->proc_net); WARN_ON_ONCE(!hlist_empty(&net->packet.sklist)); } static struct pernet_operations packet_net_ops = { .init = packet_net_init, .exit = packet_net_exit, }; static void __exit packet_exit(void) { sock_unregister(PF_PACKET); proto_unregister(&packet_proto); unregister_netdevice_notifier(&packet_netdev_notifier); unregister_pernet_subsys(&packet_net_ops); } static int __init packet_init(void) { int rc; rc = register_pernet_subsys(&packet_net_ops); if (rc) goto out; rc = register_netdevice_notifier(&packet_netdev_notifier); if (rc) goto out_pernet; rc = proto_register(&packet_proto, 0); if (rc) goto out_notifier; rc = sock_register(&packet_family_ops); if (rc) goto out_proto; return 0; out_proto: proto_unregister(&packet_proto); out_notifier: unregister_netdevice_notifier(&packet_netdev_notifier); out_pernet: unregister_pernet_subsys(&packet_net_ops); out: return rc; } module_init(packet_init); module_exit(packet_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PACKET);
747 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 // SPDX-License-Identifier: GPL-2.0 // Copyright(c) 2018 Linus Torvalds. All rights reserved. // Copyright(c) 2018 Alexei Starovoitov. All rights reserved. // Copyright(c) 2018 Intel Corporation. All rights reserved. #ifndef _LINUX_NOSPEC_H #define _LINUX_NOSPEC_H #include <linux/compiler.h> #include <asm/barrier.h> struct task_struct; #ifndef barrier_nospec # define barrier_nospec() do { } while (0) #endif /** * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise * @index: array element index * @size: number of elements in array * * When @index is out of bounds (@index >= @size), the sign bit will be * set. Extend the sign bit to all bits and invert, giving a result of * zero for an out of bounds index, or ~0 if within bounds [0, @size). */ #ifndef array_index_mask_nospec static inline unsigned long array_index_mask_nospec(unsigned long index, unsigned long size) { /* * Always calculate and emit the mask even if the compiler * thinks the mask is not needed. The compiler does not take * into account the value of @index under speculation. */ OPTIMIZER_HIDE_VAR(index); return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); } #endif /* * array_index_nospec - sanitize an array index after a bounds check * * For a code sequence like: * * if (index < size) { * index = array_index_nospec(index, size); * val = array[index]; * } * * ...if the CPU speculates past the bounds check then * array_index_nospec() will clamp the index within the range of [0, * size). */ #define array_index_nospec(index, size) \ ({ \ typeof(index) _i = (index); \ typeof(size) _s = (size); \ unsigned long _mask = array_index_mask_nospec(_i, _s); \ \ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ \ (typeof(_i)) (_i & _mask); \ }) /* Speculation control prctl */ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which); int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, unsigned long ctrl); /* Speculation control for seccomp enforced mitigation */ void arch_seccomp_spec_mitigate(struct task_struct *task); #endif /* _LINUX_NOSPEC_H */
8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 // SPDX-License-Identifier: GPL-2.0 /* * LED Triggers for USB Activity * * Copyright 2014 Michal Sojka <sojka@merica.cz> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/leds.h> #include <linux/usb.h> #include "common.h" #define BLINK_DELAY 30 DEFINE_LED_TRIGGER(ledtrig_usb_gadget); DEFINE_LED_TRIGGER(ledtrig_usb_host); void usb_led_activity(enum usb_led_event ev) { struct led_trigger *trig = NULL; switch (ev) { case USB_LED_EVENT_GADGET: trig = ledtrig_usb_gadget; break; case USB_LED_EVENT_HOST: trig = ledtrig_usb_host; break; } /* led_trigger_blink_oneshot() handles trig == NULL gracefully */ led_trigger_blink_oneshot(trig, BLINK_DELAY, BLINK_DELAY, 0); } EXPORT_SYMBOL_GPL(usb_led_activity); void __init ledtrig_usb_init(void) { led_trigger_register_simple("usb-gadget", &ledtrig_usb_gadget); led_trigger_register_simple("usb-host", &ledtrig_usb_host); } void __exit ledtrig_usb_exit(void) { led_trigger_unregister_simple(ledtrig_usb_gadget); led_trigger_unregister_simple(ledtrig_usb_host); }
198 155 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #ifndef _LINUX_BPF_H #define _LINUX_BPF_H 1 #include <uapi/linux/bpf.h> #include <uapi/linux/filter.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/percpu.h> #include <linux/err.h> #include <linux/rbtree_latch.h> #include <linux/numa.h> #include <linux/mm_types.h> #include <linux/wait.h> #include <linux/refcount.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/capability.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> #include <linux/stddef.h> #include <linux/bpfptr.h> #include <linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/static_call.h> #include <linux/memcontrol.h> struct bpf_verifier_env; struct bpf_verifier_log; struct perf_event; struct bpf_prog; struct bpf_prog_aux; struct bpf_map; struct sock; struct seq_file; struct btf; struct btf_type; struct exception_table_entry; struct seq_operations; struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; extern bool bpf_global_ma_set, bpf_global_percpu_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); typedef unsigned int (*bpf_func_t)(const void *, const struct bpf_insn *); struct bpf_iter_seq_info { const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; }; /* map is generic key/value storage optionally accessible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ int (*map_alloc_check)(union bpf_attr *attr); struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_release)(struct bpf_map *map, struct file *map_file); void (*map_free)(struct bpf_map *map); int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); void (*map_release_uref)(struct bpf_map *map); void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_lookup_and_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_update_batch)(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); long (*map_delete_elem)(struct bpf_map *map, void *key); long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); /* Prog poke tracking helpers. */ int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux); void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux); void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new); /* Direct value access helpers. */ int (*map_direct_value_addr)(const struct bpf_map *map, u64 *imm, u32 off); int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); /* Functions called by bpf_local_storage maps */ int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, void *owner, u32 size); void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, void *owner, u32 size); struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure * an inner map can be inserted to an outer map. * * Some properties of the inner map has been used during the * verification time. When inserting an inner map at the runtime, * map_meta_equal has to ensure the inserting map has the same * properties that the verifier has used earlier. */ bool (*map_meta_equal)(const struct bpf_map *meta0, const struct bpf_map *meta1); int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); long (*map_for_each_callback)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags); u64 (*map_mem_usage)(const struct bpf_map *map); /* BTF id of struct allocated by map_alloc */ int *map_btf_id; /* bpf_iter info used to open a seq_file */ const struct bpf_iter_seq_info *iter_seq_info; }; enum { /* Support at most 10 fields in a BTF type */ BTF_FIELDS_MAX = 10, }; enum btf_field_type { BPF_SPIN_LOCK = (1 << 0), BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR_PERCPU = (1 << 4), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_PERCPU, BPF_LIST_HEAD = (1 << 5), BPF_LIST_NODE = (1 << 6), BPF_RB_ROOT = (1 << 7), BPF_RB_NODE = (1 << 8), BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | BPF_RB_NODE | BPF_RB_ROOT, BPF_REFCOUNT = (1 << 9), }; typedef void (*btf_dtor_kfunc_t)(void *); struct btf_field_kptr { struct btf *btf; struct module *module; /* dtor used if btf_is_kernel(btf), otherwise the type is * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used */ btf_dtor_kfunc_t dtor; u32 btf_id; }; struct btf_field_graph_root { struct btf *btf; u32 value_btf_id; u32 node_offset; struct btf_record *value_rec; }; struct btf_field { u32 offset; u32 size; enum btf_field_type type; union { struct btf_field_kptr kptr; struct btf_field_graph_root graph_root; }; }; struct btf_record { u32 cnt; u32 field_mask; int spin_lock_off; int timer_off; int refcount_off; struct btf_field fields[]; }; /* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */ struct bpf_rb_node_kern { struct rb_node rb_node; void *owner; } __attribute__((aligned(8))); /* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */ struct bpf_list_node_kern { struct list_head list_head; void *owner; } __attribute__((aligned(8))); struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). */ const struct bpf_map_ops *ops ____cacheline_aligned; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY void *security; #endif enum bpf_map_type map_type; u32 key_size; u32 value_size; u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; u32 id; struct btf_record *record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; u32 btf_vmlinux_value_type_id; struct btf *btf; #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; struct work_struct work; struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program * that is going to use this map or by the first program which FD is * stored in the map to make sure that all callers and callees have the * same prog type, JITed flag and xdp_has_frags flag. */ struct { spinlock_t lock; enum bpf_prog_type type; bool jited; bool xdp_has_frags; } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ s64 __percpu *elem_count; }; static inline const char *btf_field_type_name(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return "bpf_spin_lock"; case BPF_TIMER: return "bpf_timer"; case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; case BPF_KPTR_PERCPU: return "percpu_kptr"; case BPF_LIST_HEAD: return "bpf_list_head"; case BPF_LIST_NODE: return "bpf_list_node"; case BPF_RB_ROOT: return "bpf_rb_root"; case BPF_RB_NODE: return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; default: WARN_ON_ONCE(1); return "unknown"; } } static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return sizeof(struct bpf_spin_lock); case BPF_TIMER: return sizeof(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); case BPF_LIST_NODE: return sizeof(struct bpf_list_node); case BPF_RB_ROOT: return sizeof(struct bpf_rb_root); case BPF_RB_NODE: return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); default: WARN_ON_ONCE(1); return 0; } } static inline u32 btf_field_type_align(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return __alignof__(struct bpf_spin_lock); case BPF_TIMER: return __alignof__(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); case BPF_LIST_NODE: return __alignof__(struct bpf_list_node); case BPF_RB_ROOT: return __alignof__(struct bpf_rb_root); case BPF_RB_NODE: return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); default: WARN_ON_ONCE(1); return 0; } } static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) { memset(addr, 0, field->size); switch (field->type) { case BPF_REFCOUNT: refcount_set((refcount_t *)addr, 1); break; case BPF_RB_NODE: RB_CLEAR_NODE((struct rb_node *)addr); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: INIT_LIST_HEAD((struct list_head *)addr); break; case BPF_RB_ROOT: /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: break; default: WARN_ON_ONCE(1); return; } } static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) { if (IS_ERR_OR_NULL(rec)) return false; return rec->field_mask & type; } static inline void bpf_obj_init(const struct btf_record *rec, void *obj) { int i; if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset); } /* 'dst' must be a temporary buffer and should not point to memory that is being * used in parallel by a bpf program or bpf syscall, otherwise the access from * the bpf program or bpf syscall may be corrupted by the reinitialization, * leading to weird problems. Even 'dst' is newly-allocated from bpf memory * allocator, it is still possible for 'dst' to be used in parallel by a bpf * program or bpf syscall. */ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { bpf_obj_init(map->record, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. * Best-effort only. No barriers here, since it _will_ race with concurrent * updates from BPF programs. Called from bpf syscall and mostly used with * size 8 or 16 bytes, so ask compiler to inline it. */ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) { const long *lsrc = src; long *ldst = dst; size /= sizeof(long); while (size--) data_race(*ldst++ = *lsrc++); } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ static inline void bpf_obj_memcpy(struct btf_record *rec, void *dst, void *src, u32 size, bool long_memcpy) { u32 curr_off = 0; int i; if (IS_ERR_OR_NULL(rec)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(size, 8)); else memcpy(dst, src, size); return; } for (i = 0; i < rec->cnt; i++) { u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); curr_off += rec->fields[i].size + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { bpf_obj_memcpy(map->record, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { bpf_obj_memcpy(map->record, dst, src, map->value_size, true); } static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size) { u32 curr_off = 0; int i; if (IS_ERR_OR_NULL(rec)) { memset(dst, 0, size); return; } for (i = 0; i < rec->cnt; i++) { u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); curr_off += rec->fields[i].size + sz; } memset(dst + curr_off, 0, size - curr_off); } static inline void zero_map_value(struct bpf_map *map, void *dst) { bpf_obj_memzero(map->record, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; struct bpf_offloaded_map; struct bpf_map_dev_ops { int (*map_get_next_key)(struct bpf_offloaded_map *map, void *key, void *next_key); int (*map_lookup_elem)(struct bpf_offloaded_map *map, void *key, void *value); int (*map_update_elem)(struct bpf_offloaded_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); }; struct bpf_offloaded_map { struct bpf_map map; struct net_device *netdev; const struct bpf_map_dev_ops *dev_ops; void *dev_priv; struct list_head offloads; }; static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) { return container_of(map, struct bpf_offloaded_map, map); } static inline bool bpf_map_offload_neutral(const struct bpf_map *map) { return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; } static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { return (map->btf_value_type_id || map->btf_vmlinux_value_type_id) && map->ops->map_seq_show_elem; } int map_check_no_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1); extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of * arg_type, ret_type and reg_type. For example, a pointer value may be null, * or a memory is read-only. We classify types into two categories: base types * and extended types. Extended types are base types combined with a type flag. * * Currently there are no more than 32 base types in arg_type, ret_type and * reg_types. */ #define BPF_BASE_TYPE_BITS 8 enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), /* MEM is read-only. When applied on bpf_arg, it indicates the arg is * compatible with both mutable and immutable memory. */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), /* MEM points to BPF ring buffer reservation. */ MEM_RINGBUF = BIT(2 + BPF_BASE_TYPE_BITS), /* MEM is in user address space. */ MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), /* MEM is a percpu memory. MEM_PERCPU tags PTR_TO_BTF_ID. When tagged * with MEM_PERCPU, PTR_TO_BTF_ID _cannot_ be directly accessed. In * order to drop this tag, it must be passed into bpf_per_cpu_ptr() * or bpf_this_cpu_ptr(), which will return the pointer corresponding * to the specified cpu. */ MEM_PERCPU = BIT(4 + BPF_BASE_TYPE_BITS), /* Indicates that the argument will be released. */ OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS), /* PTR is not trusted. This is only used with PTR_TO_BTF_ID, to mark * unreferenced and referenced kptr loaded from map value using a load * instruction, so that they can only be dereferenced but not escape the * BPF program into the kernel (i.e. cannot be passed as arguments to * kfunc or bpf helpers). */ PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS), /* MEM is of an allocated object of type in program BTF. This is used to * tag PTR_TO_BTF_ID allocated using bpf_obj_new. */ MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), /* PTR was passed from the kernel in a trusted context, and may be * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. * PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). What we really need to know is * whether a pointer is safe to pass to a kfunc or BPF helper function. * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF * helpers, they do not cover all possible instances of unsafe * pointers. For example, a pointer that was obtained from walking a * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the * fact that it may be NULL, invalid, etc. This is due to backwards * compatibility requirements, as this was the behavior that was first * introduced when kptrs were added. The behavior is now considered * deprecated, and PTR_UNTRUSTED will eventually be removed. * * PTR_TRUSTED, on the other hand, is a pointer that the kernel * guarantees to be valid and safe to pass to kfuncs and BPF helpers. * For example, pointers passed to tracepoint arguments are considered * PTR_TRUSTED, as are pointers that are passed to struct_ops * callbacks. As alluded to above, pointers that are obtained from * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a * struct task_struct *task is PTR_TRUSTED, then accessing * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored * in a BPF register. Similarly, pointers passed to certain programs * types such as kretprobes are not guaranteed to be valid, as they may * for example contain an object that was recently freed. */ PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */ MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning. * Currently only valid for linked-list and rbtree nodes. If the nodes * have a bpf_refcount_field, they must be tagged MEM_RCU as well. */ NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS), /* DYNPTR points to sk_buff */ DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS), /* DYNPTR points to xdp_buff */ DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS), __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ | DYNPTR_TYPE_XDP) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) /* Max number of all types. */ #define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ /* the following constraints used to prototype * bpf_map_lookup/update/delete_elem() functions */ ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ /* Used to prototype bpf_memcmp() and other functions that access data * on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ ARG_CONST_SIZE, /* number of bytes accessed from memory */ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* pointer to memory does not need to be initialized, helper function must fill * all bytes or clear them in error case. */ ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | ARG_PTR_TO_MEM, /* Pointer to valid memory of size known at compile time. */ ARG_PTR_TO_FIXED_SIZE_MEM = MEM_FIXED_SIZE | ARG_PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* type of values returned from helper functions */ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ RET_PTR_TO_MEM, /* returns a pointer to memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, /* Extended ret_types. */ RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, RET_PTR_TO_BTF_ID_TRUSTED = PTR_TRUSTED | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL * instructions after verifying */ struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; bool pkt_access; bool might_sleep; enum bpf_return_type ret_type; union { struct { enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; enum bpf_arg_type arg3_type; enum bpf_arg_type arg4_type; enum bpf_arg_type arg5_type; }; enum bpf_arg_type arg_type[5]; }; union { struct { u32 *arg1_btf_id; u32 *arg2_btf_id; u32 *arg3_btf_id; u32 *arg4_btf_id; u32 *arg5_btf_id; }; u32 *arg_btf_id[5]; struct { size_t arg1_size; size_t arg2_size; size_t arg3_size; size_t arg4_size; size_t arg5_size; }; size_t arg_size[5]; }; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(const struct bpf_prog *prog); }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is * the first argument to eBPF programs. * For socket filters: 'struct bpf_context *' == 'struct sk_buff *' */ struct bpf_context; enum bpf_access_type { BPF_READ = 1, BPF_WRITE = 2 }; /* types of values stored in eBPF registers */ /* Pointer types represent: * pointer * pointer + imm * pointer + (u16) var * pointer + (u16) var + imm * if (range > 0) then [ptr, ptr + range - off) is safe to access * if (id > 0) means that some 'var' was added * if (off > 0) means that 'imm' was added */ enum bpf_reg_type { NOT_INIT = 0, /* nothing was written into register */ SCALAR_VALUE, /* reg doesn't contain a valid pointer */ PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_KEY, /* reg points to a map element key */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need * to be null checked by the BPF program. This does not imply the * pointer is _not_ null and in practice this can easily be a null * pointer when reading pointer chains. The assumption is program * context will handle null pointer dereference typically via fault * handling. The verifier must keep this in mind and can make no * assumptions about null or non-null when doing branch analysis. * Further, when passed into helpers the helpers can not, without * additional context, assume the value is non-null. */ PTR_TO_BTF_ID, /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. */ PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* The information passed from prog-specific *_is_valid_access * back to the verifier. */ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; union { int ctx_field_size; struct { struct btf *btf; u32 btf_id; }; }; struct bpf_verifier_log *log; /* for verbose logs */ }; static inline void bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) { aux->ctx_field_size = size; } static inline bool bpf_pseudo_func(const struct bpf_insn *insn) { return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && insn->src_reg == BPF_PSEUDO_FUNC; } struct bpf_prog_ops { int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); }; struct bpf_reg_state; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto * (*get_func_proto)(enum bpf_func_id func_id, const struct bpf_prog *prog); /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); int (*gen_ld_abs)(const struct bpf_insn *orig, struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); }; struct bpf_prog_offload_ops { /* verifier basic callbacks */ int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); /* verifier optimization callbacks (called after .finalize) */ int (*replace_insn)(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn); int (*remove_insns)(struct bpf_verifier_env *env, u32 off, u32 cnt); /* program management callbacks */ int (*prepare)(struct bpf_prog *prog); int (*translate)(struct bpf_prog *prog); void (*destroy)(struct bpf_prog *prog); }; struct bpf_prog_offload { struct bpf_prog *prog; struct net_device *netdev; struct bpf_offload_dev *offdev; void *dev_priv; struct list_head offloads; bool dev_state; bool opt_failed; void *jited_image; u32 jited_len; }; enum bpf_cgroup_storage_type { BPF_CGROUP_STORAGE_SHARED, BPF_CGROUP_STORAGE_PERCPU, __BPF_CGROUP_STORAGE_MAX }; #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX /* The longest tracepoint has 12 args. * See include/trace/bpf_probe.h */ #define MAX_BPF_FUNC_ARGS 12 /* The maximum number of arguments passed through registers * a single function may have. */ #define MAX_BPF_FUNC_REG_ARGS 5 /* The argument is a structure. */ #define BTF_FMODEL_STRUCT_ARG BIT(0) /* The argument is signed. */ #define BTF_FMODEL_SIGNED_ARG BIT(1) struct btf_func_model { u8 ret_size; u8 ret_flags; u8 nr_args; u8 arg_size[MAX_BPF_FUNC_ARGS]; u8 arg_flags[MAX_BPF_FUNC_ARGS]; }; /* Restore arguments before returning from trampoline to let original function * continue executing. This flag is used for fentry progs when there are no * fexit progs. */ #define BPF_TRAMP_F_RESTORE_REGS BIT(0) /* Call original function after fentry progs, but before fexit progs. * Makes sense for fentry/fexit, normal calls and indirect calls. */ #define BPF_TRAMP_F_CALL_ORIG BIT(1) /* Skip current frame and return to parent. Makes sense for fentry/fexit * programs only. Should not be used with normal calls and indirect calls. */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) /* Store IP address of the caller on the trampoline stack, * so it's available for trampoline's programs. */ #define BPF_TRAMP_F_IP_ARG BIT(3) /* Return the return value of fentry prog. Only used by bpf_struct_ops. */ #define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) /* Get original function from stack instead of from provided direct address. * Makes sense for trampolines with fexit or fmod_ret programs. */ #define BPF_TRAMP_F_ORIG_STACK BIT(5) /* This trampoline is on a function with another ftrace_ops with IPMODIFY, * e.g., a live patch. This flag is set and cleared by ftrace call backs, */ #define BPF_TRAMP_F_SHARE_IPMODIFY BIT(6) /* Indicate that current trampoline is in a tail call context. Then, it has to * cache and restore tail_call_cnt to avoid infinite tail call loop. */ #define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ enum { #if defined(__s390x__) BPF_MAX_TRAMP_LINKS = 27, #else BPF_MAX_TRAMP_LINKS = 38, #endif }; struct bpf_tramp_links { struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; int nr_links; }; struct bpf_tramp_run_ctx; /* Different use cases for BPF trampoline: * 1. replace nop at the function entry (kprobe equivalent) * flags = BPF_TRAMP_F_RESTORE_REGS * fentry = a set of programs to run before returning from trampoline * * 2. replace nop at the function entry (kprobe + kretprobe equivalent) * flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME * orig_call = fentry_ip + MCOUNT_INSN_SIZE * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function * * 3. replace direct call instruction anywhere in the function body * or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid) * With flags = 0 * fentry = a set of programs to run before returning from trampoline * With flags = BPF_TRAMP_F_CALL_ORIG * orig_call = original callback addr or direct function addr * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function */ struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *orig_call); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); struct bpf_ksym { unsigned long start; unsigned long end; char name[KSYM_NAME_LEN]; struct list_head lnode; struct latch_tree_node tnode; bool prog; }; enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ }; struct bpf_tramp_image { void *image; struct bpf_ksym ksym; struct percpu_ref pcref; void *ip_after_call; void *ip_epilogue; union { struct rcu_head rcu; struct work_struct work; }; }; struct bpf_trampoline { /* hlist for trampoline_table */ struct hlist_node hlist; struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; struct { struct btf_func_model model; void *addr; bool ftrace_managed; } func; /* if !NULL this is BPF_PROG_TYPE_EXT program that extends another BPF * program by replacing one of its functions. func.addr is the address * of the function it replaced. */ struct bpf_prog *extension_prog; /* list of BPF programs using this trampoline */ struct hlist_head progs_hlist[BPF_TRAMP_MAX]; /* Number of attached programs. A counter per kind. */ int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ struct bpf_tramp_image *cur_image; struct module *mod; }; struct bpf_attach_target_info { struct btf_func_model fmodel; long tgt_addr; struct module *tgt_mod; const char *tgt_name; const struct btf_type *tgt_type; }; #define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */ struct bpf_dispatcher_prog { struct bpf_prog *prog; refcount_t users; }; struct bpf_dispatcher { /* dispatcher mutex */ struct mutex mutex; void *func; struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; int num_progs; void *image; void *rw_image; u32 image_off; struct bpf_ksym ksym; #ifdef CONFIG_HAVE_STATIC_CALL struct static_call_key *sc_key; void *sc_tramp; #endif }; static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, bpf_func_t bpf_func) { return bpf_func(ctx, insnsi); } /* the implementation of the opaque uapi struct bpf_dynptr */ struct bpf_dynptr_kern { void *data; /* Size represents the number of usable bytes of dynptr data. * If for example the offset is at 4 for a local dynptr whose data is * of type u64, the number of usable bytes is 4. * * The upper 8 bits are reserved. It is as follows: * Bits 0 - 23 = size * Bits 24 - 30 = dynptr type * Bit 31 = whether dynptr is read-only */ u32 size; u32 offset; } __aligned(8); enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, /* Underlying data is a ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, /* Underlying data is a sk_buff */ BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */ BPF_DYNPTR_TYPE_XDP, }; int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); /* * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn * indirection with a direct call to the bpf program. If the architecture does * not have STATIC_CALL, avoid a double-indirection. */ #ifdef CONFIG_HAVE_STATIC_CALL #define __BPF_DISPATCHER_SC_INIT(_name) \ .sc_key = &STATIC_CALL_KEY(_name), \ .sc_tramp = STATIC_CALL_TRAMP_ADDR(_name), #define __BPF_DISPATCHER_SC(name) \ DEFINE_STATIC_CALL(bpf_dispatcher_##name##_call, bpf_dispatcher_nop_func) #define __BPF_DISPATCHER_CALL(name) \ static_call(bpf_dispatcher_##name##_call)(ctx, insnsi, bpf_func) #define __BPF_DISPATCHER_UPDATE(_d, _new) \ __static_call_update((_d)->sc_key, (_d)->sc_tramp, (_new)) #else #define __BPF_DISPATCHER_SC_INIT(name) #define __BPF_DISPATCHER_SC(name) #define __BPF_DISPATCHER_CALL(name) bpf_func(ctx, insnsi) #define __BPF_DISPATCHER_UPDATE(_d, _new) #endif #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ .progs = {}, \ .num_progs = 0, \ .image = NULL, \ .image_off = 0, \ .ksym = { \ .name = #_name, \ .lnode = LIST_HEAD_INIT(_name.ksym.lnode), \ }, \ __BPF_DISPATCHER_SC_INIT(_name##_call) \ } #define DEFINE_BPF_DISPATCHER(name) \ __BPF_DISPATCHER_SC(name); \ noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func) \ { \ return __BPF_DISPATCHER_CALL(name); \ } \ EXPORT_SYMBOL(bpf_dispatcher_##name##_func); \ struct bpf_dispatcher bpf_dispatcher_##name = \ BPF_DISPATCHER_INIT(bpf_dispatcher_##name); #define DECLARE_BPF_DISPATCHER(name) \ unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func); \ extern struct bpf_dispatcher bpf_dispatcher_##name; #define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func #define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name) void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; } static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; } static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { return NULL; } static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) #define DECLARE_BPF_DISPATCHER(name) #define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nop_func #define BPF_DISPATCHER_PTR(name) NULL static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to) {} static inline bool is_bpf_image_address(unsigned long address) { return false; } static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { return false; } #endif struct bpf_func_info_aux { u16 linkage; bool unreliable; }; enum bpf_jit_poke_reason { BPF_POKE_REASON_TAIL_CALL, }; /* Descriptor of pokes pointing /into/ the JITed image. */ struct bpf_jit_poke_descriptor { void *tailcall_target; void *tailcall_bypass; void *bypass_addr; void *aux; union { struct { struct bpf_map *map; u32 key; } tail_call; }; bool tailcall_target_stable; u8 adj_off; u16 reason; u32 insn_idx; }; /* reg_type info for ctx arguments */ struct bpf_ctx_arg_aux { u32 offset; enum bpf_reg_type reg_type; u32 btf_id; }; struct btf_mod_pair { struct btf *btf; struct module *module; }; struct bpf_kfunc_desc_tab; struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; u32 used_btf_cnt; u32 max_ctx_offset; u32 max_pkt_offset; u32 max_tp_access; u32 stack_depth; u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; struct btf *attach_btf; const struct bpf_ctx_arg_aux *ctx_arg_info; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; enum bpf_prog_type saved_dst_prog_type; enum bpf_attach_type saved_dst_attach_type; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; bool xdp_has_frags; bool exception_cb; bool exception_boundary; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ const char *attach_func_name; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ struct btf_mod_pair *used_btfs; struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ u32 verified_insns; int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp); #ifdef CONFIG_SECURITY void *security; #endif struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; struct bpf_func_info_aux *func_info_aux; /* bpf_line_info loaded from userspace. linfo->insn_off * has the xlated insn offset. * Both the main and sub prog share the same linfo. * The subprog can access its first linfo by * using the linfo_idx. */ struct bpf_line_info *linfo; /* jited_linfo is the jited addr of the linfo. It has a * one to one mapping to linfo: * jited_linfo[i] is the jited addr for the linfo[i]->insn_off. * Both the main and sub prog share the same jited_linfo. * The subprog can access its first jited_linfo by * using the linfo_idx. */ void **jited_linfo; u32 func_info_cnt; u32 nr_linfo; /* subprog can use linfo_idx to access its first linfo and * jited_linfo. * main prog always has linfo_idx == 0 */ u32 linfo_idx; struct module *mod; u32 num_exentries; struct exception_table_entry *extable; union { struct work_struct work; struct rcu_head rcu; }; }; struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ jit_requested:1,/* archs need to JIT the prog */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ blinding_requested:1, /* needs constant blinding */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; struct bpf_prog_stats __percpu *stats; int __percpu *active; unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ /* Instructions for interpreter */ union { DECLARE_FLEX_ARRAY(struct sock_filter, insns); DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi); }; }; struct bpf_array_aux { /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; struct mutex poke_mutex; struct work_struct work; }; struct bpf_link { atomic64_t refcnt; u32 id; enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; struct work_struct work; }; struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(const struct bpf_link *link, struct bpf_link_info *info); int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, struct bpf_map *old_map); }; struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; u64 cookie; }; struct bpf_shim_tramp_link { struct bpf_tramp_link link; struct bpf_trampoline *trampoline; }; struct bpf_tracing_link { struct bpf_tramp_link link; enum bpf_attach_type attach_type; struct bpf_trampoline *trampoline; struct bpf_prog *tgt_prog; }; struct bpf_link_primer { struct bpf_link *link; struct file *file; int fd; u32 id; }; struct bpf_struct_ops_value; struct btf_member; #define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 /** * struct bpf_struct_ops - A structure of callbacks allowing a subsystem to * define a BPF_MAP_TYPE_STRUCT_OPS map type composed * of BPF_PROG_TYPE_STRUCT_OPS progs. * @verifier_ops: A structure of callbacks that are invoked by the verifier * when determining whether the struct_ops progs in the * struct_ops map are valid. * @init: A callback that is invoked a single time, and before any other * callback, to initialize the structure. A nonzero return value means * the subsystem could not be initialized. * @check_member: When defined, a callback invoked by the verifier to allow * the subsystem to determine if an entry in the struct_ops map * is valid. A nonzero return value means that the map is * invalid and should be rejected by the verifier. * @init_member: A callback that is invoked for each member of the struct_ops * map to allow the subsystem to initialize the member. A nonzero * value means the member could not be initialized. This callback * is exclusive with the @type, @type_id, @value_type, and * @value_id fields. * @reg: A callback that is invoked when the struct_ops map has been * initialized and is being attached to. Zero means the struct_ops map * has been successfully registered and is live. A nonzero return value * means the struct_ops map could not be registered. * @unreg: A callback that is invoked when the struct_ops map should be * unregistered. * @update: A callback that is invoked when the live struct_ops map is being * updated to contain new values. This callback is only invoked when * the struct_ops map is loaded with BPF_F_LINK. If not defined, the * it is assumed that the struct_ops map cannot be updated. * @validate: A callback that is invoked after all of the members have been * initialized. This callback should perform static checks on the * map, meaning that it should either fail or succeed * deterministically. A struct_ops map that has been validated may * not necessarily succeed in being registered if the call to @reg * fails. For example, a valid struct_ops map may be loaded, but * then fail to be registered due to there being another active * struct_ops map on the system in the subsystem already. For this * reason, if this callback is not defined, the check is skipped as * the struct_ops map will have final verification performed in * @reg. * @type: BTF type. * @value_type: Value type. * @name: The name of the struct bpf_struct_ops object. * @func_models: Func models * @type_id: BTF type id. * @value_id: BTF value id. */ struct bpf_struct_ops { const struct bpf_verifier_ops *verifier_ops; int (*init)(struct btf *btf); int (*check_member)(const struct btf_type *t, const struct btf_member *member, const struct bpf_prog *prog); int (*init_member)(const struct btf_type *t, const struct btf_member *member, void *kdata, const void *udata); int (*reg)(void *kdata); void (*unreg)(void *kdata); int (*update)(void *kdata, void *old_kdata); int (*validate)(void *kdata); const struct btf_type *type; const struct btf_type *value_type; const char *name; struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; u32 type_id; u32 value_id; }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id); void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log); bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) return bpf_struct_ops_get(data); else return try_module_get(owner); } static inline void bpf_module_put(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) bpf_struct_ops_put(data); else module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ struct bpf_dummy_ops_state { int val; }; struct bpf_dummy_ops { int (*test_1)(struct bpf_dummy_ops_state *cb); int (*test_2)(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2, char a3, unsigned long a4); int (*test_sleepable)(struct bpf_dummy_ops_state *cb); }; int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif #else static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { return NULL; } static inline void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) { } static inline bool bpf_try_module_get(const void *data, struct module *owner) { return try_module_get(owner); } static inline void bpf_module_put(const void *data, struct module *owner) { module_put(owner); } static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value) { return -EINVAL; } static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } #endif #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype); void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog); #else static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype) { return -EOPNOTSUPP; } static inline void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) { } #endif struct bpf_array { struct bpf_map map; u32 elem_size; u32 index_mask; struct bpf_array_aux *aux; union { DECLARE_FLEX_ARRAY(char, value) __aligned(8); DECLARE_FLEX_ARRAY(void *, ptrs) __aligned(8); DECLARE_FLEX_ARRAY(void __percpu *, pptrs) __aligned(8); }; }; #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 /* Maximum number of loops for bpf_loop and bpf_iter_num. * It's enum to expose it (and thus make it discoverable) through BTF. */ enum { BPF_MAX_LOOPS = 8 * 1024 * 1024, }; #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ BPF_F_WRONLY | \ BPF_F_WRONLY_PROG) #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) /* Maximum number of user-producer ring buffer samples that can be drained in * a call to bpf_user_ringbuf_drain(). */ #define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); /* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is * not possible. */ if (access_flags & BPF_F_RDONLY_PROG) return BPF_MAP_CAN_READ; else if (access_flags & BPF_F_WRONLY_PROG) return BPF_MAP_CAN_WRITE; else return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE; } static inline bool bpf_map_flags_access_ok(u32 access_flags) { return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) != (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); } struct bpf_event_entry { struct perf_event *event; struct file *perf_file; struct file *map_file; struct rcu_head rcu; }; static inline bool map_type_contains_progs(struct bpf_map *map) { return map->map_type == BPF_MAP_TYPE_PROG_ARRAY || map->map_type == BPF_MAP_TYPE_DEVMAP || map->map_type == BPF_MAP_TYPE_CPUMAP; } bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void); typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, unsigned long off, unsigned long len); typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); /* an array of programs to be executed under rcu_lock. * * Typical usage: * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. * The user has to keep refcnt on the program and make sure the program * is removed from the array before bpf_prog_put(). * The 'struct bpf_prog_array *' should only be replaced with xchg() * since other cpus are walking the array of pointers in parallel. */ struct bpf_prog_array_item { struct bpf_prog *prog; union { struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; u64 bpf_cookie; }; }; struct bpf_prog_array { struct rcu_head rcu; struct bpf_prog_array_item items[]; }; struct bpf_empty_prog_array { struct bpf_prog_array hdr; struct bpf_prog *null_prog; }; /* to avoid allocating empty bpf_prog_array for cgroups that * don't have bpf program attached use one global 'bpf_empty_prog_array' * It will not be modified the caller of bpf_prog_array_alloc() * (since caller requested prog_cnt == 0) * that pointer should be 'freed' by bpf_prog_array_free() */ extern struct bpf_empty_prog_array bpf_empty_prog_array; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); /* Use when traversal over the bpf_prog_array uses tasks_trace rcu */ void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); void bpf_prog_array_delete_safe(struct bpf_prog_array *progs, struct bpf_prog *old_prog); int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index); int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, struct bpf_prog *prog); int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, u64 bpf_cookie, struct bpf_prog_array **new_array); struct bpf_run_ctx {}; struct bpf_cg_run_ctx { struct bpf_run_ctx run_ctx; const struct bpf_prog_array_item *prog_item; int retval; }; struct bpf_trace_run_ctx { struct bpf_run_ctx run_ctx; u64 bpf_cookie; bool is_uprobe; }; struct bpf_tramp_run_ctx { struct bpf_run_ctx run_ctx; u64 bpf_cookie; struct bpf_run_ctx *saved_run_ctx; }; static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; #ifdef CONFIG_BPF_SYSCALL old_ctx = current->bpf_ctx; current->bpf_ctx = new_ctx; #endif return old_ctx; } static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) { #ifdef CONFIG_BPF_SYSCALL current->bpf_ctx = old_ctx; #endif } /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) /* BPF program asks to set CN on the packet. */ #define BPF_RET_SET_CN (1 << 0) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); static __always_inline u32 bpf_prog_run_array(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); if (unlikely(!array)) return ret; run_ctx.is_uprobe = false; migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; } bpf_reset_run_ctx(old_run_ctx); migrate_enable(); return ret; } /* Notes on RCU design for bpf_prog_arrays containing sleepable programs: * * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array * overall. As a result, we must use the bpf_prog_array_free_sleepable * in order to use the tasks_trace rcu grace period. * * When a non-sleepable program is inside the array, we take the rcu read * section and disable preemption for that program alone, so it can access * rcu-protected dynamically sized maps. */ static __always_inline u32 bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; might_fault(); rcu_read_lock_trace(); migrate_disable(); run_ctx.is_uprobe = true; array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held()); if (unlikely(!array)) goto out; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { if (!prog->aux->sleepable) rcu_read_lock(); run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; if (!prog->aux->sleepable) rcu_read_unlock(); } bpf_reset_run_ctx(old_run_ctx); out: migrate_enable(); rcu_read_unlock_trace(); return ret; } #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; /* * Block execution of BPF programs attached to instrumentation (perf, * kprobes, tracepoints) to prevent deadlocks on map operations as any of * these events can happen inside a region which holds a map bucket lock * and can deadlock on it. */ static inline void bpf_disable_instrumentation(void) { migrate_disable(); this_cpu_inc(bpf_prog_active); } static inline void bpf_enable_instrumentation(void) { this_cpu_dec(bpf_prog_active); migrate_enable(); } extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; #define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); void bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog); void bpf_map_free_id(struct bpf_map *map); struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, u32 field_mask); void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map); void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref); struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); bool bpf_map_write_active(const struct bpf_map *map); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); #ifdef CONFIG_MEMCG_KMEM void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags); void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags); #else static inline void * bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { return kmalloc_node(size, flags, node); } static inline void * bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { return kzalloc(size, flags); } static inline void * bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags) { return kvcalloc(n, size, flags); } static inline void __percpu * bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { return __alloc_percpu_gfp(size, align, flags); } #endif static inline int bpf_map_init_elem_count(struct bpf_map *map) { size_t size = sizeof(*map->elem_count), align = size; gfp_t flags = GFP_USER | __GFP_NOWARN; map->elem_count = bpf_map_alloc_percpu(map, size, align, flags); if (!map->elem_count) return -ENOMEM; return 0; } static inline void bpf_map_free_elem_count(struct bpf_map *map) { free_percpu(map->elem_count); } static inline void bpf_map_inc_elem_count(struct bpf_map *map) { this_cpu_inc(*map->elem_count); } static inline void bpf_map_dec_elem_count(struct bpf_map *map) { this_cpu_dec(*map->elem_count); } extern int sysctl_unprivileged_bpf_disabled; static inline bool bpf_allow_ptr_leaks(void) { return perfmon_capable(); } static inline bool bpf_allow_uninit_stack(void) { return perfmon_capable(); } static inline bool bpf_bypass_spec_v1(void) { return cpu_mitigations_off() || perfmon_capable(); } static inline bool bpf_bypass_spec_v4(void) { return cpu_mitigations_off() || perfmon_capable(); } int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } /* * The task type of iterators. * * For BPF task iterators, they can be parameterized with various * parameters to visit only some of tasks. * * BPF_TASK_ITER_ALL (default) * Iterate over resources of every task. * * BPF_TASK_ITER_TID * Iterate over resources of a task/tid. * * BPF_TASK_ITER_TGID * Iterate over resources of every task of a process / task group. */ enum bpf_iter_task_type { BPF_TASK_ITER_ALL = 0, BPF_TASK_ITER_TID, BPF_TASK_ITER_TGID, }; struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; /* for cgroup iter */ struct { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; struct { enum bpf_iter_task_type type; u32 pid; } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, struct seq_file *seq); typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); typedef const struct bpf_func_proto * (*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id, const struct bpf_prog *prog); enum bpf_iter_feature { BPF_ITER_RESCHED = BIT(0), }; #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; bpf_iter_attach_target_t attach_target; bpf_iter_detach_target_t detach_target; bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; bpf_iter_get_func_proto_t get_func_proto; u32 ctx_arg_info_size; u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; struct bpf_iter_meta { __bpf_md_ptr(struct seq_file *, seq); u64 session_id; u64 seq_num; }; struct bpf_iter__bpf_map_elem { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(void *, value); }; int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); const struct bpf_func_proto * bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq); int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); #endif struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ struct xdp_frame; struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; void __dev_flush(void); int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) { return (attr->map_flags & BPF_F_NUMA_NODE) ? attr->numa_node : NUMA_NO_NODE; } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); int array_map_alloc_check(union bpf_attr *attr); int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_nf(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); static inline bool bpf_tracing_ctx_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; return true; } static inline bool bpf_tracing_btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (!bpf_tracing_ctx_access(off, size, type)) return false; return btf_ctx_access(off, size, type, prog, info); } int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, bool strict); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *func_proto, const char *func_name, struct btf_func_model *m); struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg, bool is_ex_cb); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key); struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, u16 btf_fd_idx, u8 **func_addr); struct bpf_core_ctx { struct bpf_verifier_log *log; const struct btf *btf; }; bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, const char *field_name, u32 btf_id, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, const struct btf *arg_btf, u32 arg_id); int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn); static inline bool unprivileged_ebpf_enabled(void) { return !sysctl_unprivileged_bpf_disabled; } /* Not all bpf prog type has the bpf_ctx. * For the bpf prog type that has initialized the bpf_ctx, * this function can be used to decide if a kernel function * is called by a bpf program. */ static inline bool has_current_bpf_ctx(void) { return !!current->bpf_ctx; } void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); bool dev_check_flush(void); bool cpu_map_check_flush(void); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); } static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_prog_add(struct bpf_prog *prog, int i) { } static inline void bpf_prog_sub(struct bpf_prog *prog, int i) { } static inline void bpf_prog_put(struct bpf_prog *prog) { } static inline void bpf_prog_inc(struct bpf_prog *prog) { } static inline struct bpf_prog *__must_check bpf_prog_inc_not_zero(struct bpf_prog *prog) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { } static inline int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { return -EOPNOTSUPP; } static inline int bpf_link_settle(struct bpf_link_primer *primer) { return -EOPNOTSUPP; } static inline void bpf_link_cleanup(struct bpf_link_primer *primer) { } static inline void bpf_link_inc(struct bpf_link *link) { } static inline void bpf_link_put(struct bpf_link *link) { } static inline int bpf_obj_get_user(const char __user *pathname, int flags) { return -EOPNOTSUPP; } static inline void __dev_flush(void) { } struct xdp_frame; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; static inline int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { return 0; } struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog) { return 0; } static inline int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress) { return 0; } static inline void __cpu_map_flush(void) { } static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb) { return -EOPNOTSUPP; } static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { return ERR_PTR(-EOPNOTSUPP); } static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline void bpf_map_put(struct bpf_map *map) { } static inline struct bpf_prog *bpf_prog_by_id(u32 id) { return ERR_PTR(-ENOTSUPP); } static inline int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name) { return -EACCES; } static inline const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { return NULL; } static inline void bpf_task_storage_free(struct task_struct *task) { } static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) { return false; } static inline const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn) { return NULL; } static inline int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, u16 btf_fd_idx, u8 **func_addr) { return -ENOTSUPP; } static inline bool unprivileged_ebpf_enabled(void) { return false; } static inline bool has_current_bpf_ctx(void) { return false; } static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) { } static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) { } static inline void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { } static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { } static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { } #endif /* CONFIG_BPF_SYSCALL */ static __always_inline int bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = -EFAULT; if (IS_ENABLED(CONFIG_BPF_EVENTS)) ret = copy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } void __bpf_free_used_btfs(struct bpf_prog_aux *aux, struct btf_mod_pair *used_btfs, u32 len); static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { return bpf_prog_get_type_dev(ufd, type, false); } void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len); bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_dev_bound_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map); int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key); bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); struct bpf_offload_dev * bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev); int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev); bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); void unpriv_ebpf_notify(int new_state); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux); void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id); int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr); int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog); void bpf_dev_bound_netdev_unregister(struct net_device *dev); static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { return aux->dev_bound; } static inline bool bpf_prog_is_offloaded(const struct bpf_prog_aux *aux) { return aux->offload_requested; } bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs); static inline bool bpf_map_is_offloaded(struct bpf_map *map) { return unlikely(map->ops == &bpf_map_offload_ops); } struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map); int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); void sock_map_unhash(struct sock *sk); void sock_map_destroy(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux) { return -EOPNOTSUPP; } static inline void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) { return NULL; } static inline int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) { return -EOPNOTSUPP; } static inline int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog) { return -EOPNOTSUPP; } static inline void bpf_dev_bound_netdev_unregister(struct net_device *dev) { } static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { return false; } static inline bool bpf_prog_is_offloaded(struct bpf_prog_aux *aux) { return false; } static inline bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) { return false; } static inline bool bpf_map_is_offloaded(struct bpf_map *map) { return false; } static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_map_offload_map_free(struct bpf_map *map) { } static inline u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) { return 0; } static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } #ifdef CONFIG_BPF_SYSCALL static inline int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { return -EOPNOTSUPP; } static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { return -EOPNOTSUPP; } static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ static __always_inline void bpf_prog_inc_misses_counters(const struct bpf_prog_array *array) { const struct bpf_prog_array_item *item; struct bpf_prog *prog; if (unlikely(!array)) return; item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { bpf_prog_inc_misses_counter(prog); item++; } } #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) void bpf_sk_reuseport_detach(struct sock *sk); int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags); #else static inline void bpf_sk_reuseport_detach(struct sock *sk) { } #ifdef CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { return -EOPNOTSUPP; } static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return -EOPNOTSUPP; } #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_map_push_elem_proto; extern const struct bpf_func_proto bpf_map_pop_elem_proto; extern const struct bpf_func_proto bpf_map_peek_elem_proto; extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_get_task_stack_proto; extern const struct bpf_func_proto bpf_get_stackid_proto_pe; extern const struct bpf_func_proto bpf_get_stack_proto_pe; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; extern const struct bpf_func_proto bpf_sk_redirect_map_proto; extern const struct bpf_func_proto bpf_spin_lock_proto; extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_event_output_data_proto; extern const struct bpf_func_proto bpf_ringbuf_output_proto; extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto; extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_snprintf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, struct bpf_dynptr_kern *ptr); #else static inline bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } static inline int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, struct bpf_dynptr_kern *ptr) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_INET struct sk_reuseport_kern { struct sk_buff *skb; struct sock *sk; struct sock *selected_sk; struct sock *migrating_sk; void *data_end; u32 hash; u32 reuseport_id; bool bind_inany; }; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); #else static inline bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } static inline bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } #endif /* CONFIG_INET */ enum bpf_text_poke_type { BPF_MOD_CALL, BPF_MOD_JUMP, }; int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); void *bpf_arch_text_copy(void *dst, void *src, size_t len); int bpf_arch_text_invalidate(void *dst, size_t len); struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 #define MAX_BPRINTF_BUF 1024 struct bpf_bprintf_data { u32 *bin_args; char *buf; bool get_bin_args; bool get_buf; }; int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data); void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); void bpf_cgroup_atype_put(int cgroup_atype); #else static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ struct key; #ifdef CONFIG_KEYS struct bpf_key { struct key *key; bool has_ref; }; #endif /* CONFIG_KEYS */ static inline bool type_is_alloc(u32 type) { return type & MEM_ALLOC; } static inline gfp_t bpf_memcg_flags(gfp_t flags) { if (memcg_bpf_enabled()) return flags | __GFP_ACCOUNT; return flags; } static inline bool bpf_is_subprog(const struct bpf_prog *prog) { return prog->aux->func_idx != 0; } #endif /* _LINUX_BPF_H */
97 35 1 97 101 101 101 91 90 88 89 17 15 74 72 69 69 97 97 101 15 101 72 101 104 104 104 34 94 70 50 11 10 47 4 3 43 12 11 32 2 2 30 13 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/mount.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/slab.h> #include <uapi/linux/mount.h> #include "common.h" /* String table for special mount operations. */ static const char * const tomoyo_mounts[TOMOYO_MAX_SPECIAL_MOUNT] = { [TOMOYO_MOUNT_BIND] = "--bind", [TOMOYO_MOUNT_MOVE] = "--move", [TOMOYO_MOUNT_REMOUNT] = "--remount", [TOMOYO_MOUNT_MAKE_UNBINDABLE] = "--make-unbindable", [TOMOYO_MOUNT_MAKE_PRIVATE] = "--make-private", [TOMOYO_MOUNT_MAKE_SLAVE] = "--make-slave", [TOMOYO_MOUNT_MAKE_SHARED] = "--make-shared", }; /** * tomoyo_audit_mount_log - Audit mount log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mount_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file mount %s %s %s 0x%lX\n", r->param.mount.dev->name, r->param.mount.dir->name, r->param.mount.type->name, r->param.mount.flags); } /** * tomoyo_check_mount_acl - Check permission for path path path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mount_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mount_acl *acl = container_of(ptr, typeof(*acl), head); return tomoyo_compare_number_union(r->param.mount.flags, &acl->flags) && tomoyo_compare_name_union(r->param.mount.type, &acl->fs_type) && tomoyo_compare_name_union(r->param.mount.dir, &acl->dir_name) && (!r->param.mount.need_dev || tomoyo_compare_name_union(r->param.mount.dev, &acl->dev_name)); } /** * tomoyo_mount_acl - Check permission for mount() operation. * * @r: Pointer to "struct tomoyo_request_info". * @dev_name: Name of device file. Maybe NULL. * @dir: Pointer to "struct path". * @type: Name of filesystem type. * @flags: Mount options. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_mount_acl(struct tomoyo_request_info *r, const char *dev_name, const struct path *dir, const char *type, unsigned long flags) { struct tomoyo_obj_info obj = { }; struct path path; struct file_system_type *fstype = NULL; const char *requested_type = NULL; const char *requested_dir_name = NULL; const char *requested_dev_name = NULL; struct tomoyo_path_info rtype; struct tomoyo_path_info rdev; struct tomoyo_path_info rdir; int need_dev = 0; int error = -ENOMEM; r->obj = &obj; /* Get fstype. */ requested_type = tomoyo_encode(type); if (!requested_type) goto out; rtype.name = requested_type; tomoyo_fill_path_info(&rtype); /* Get mount point. */ obj.path2 = *dir; requested_dir_name = tomoyo_realpath_from_path(dir); if (!requested_dir_name) { error = -ENOMEM; goto out; } rdir.name = requested_dir_name; tomoyo_fill_path_info(&rdir); /* Compare fs name. */ if (type == tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_BIND] || type == tomoyo_mounts[TOMOYO_MOUNT_MOVE]) { need_dev = -1; /* dev_name is a directory */ } else { fstype = get_fs_type(type); if (!fstype) { error = -ENODEV; goto out; } if (fstype->fs_flags & FS_REQUIRES_DEV) /* dev_name is a block device file. */ need_dev = 1; } if (need_dev) { /* Get mount point or device file. */ if (!dev_name || kern_path(dev_name, LOOKUP_FOLLOW, &path)) { error = -ENOENT; goto out; } obj.path1 = path; requested_dev_name = tomoyo_realpath_from_path(&path); if (!requested_dev_name) { error = -ENOENT; goto out; } } else { /* Map dev_name to "<NULL>" if no dev_name given. */ if (!dev_name) dev_name = "<NULL>"; requested_dev_name = tomoyo_encode(dev_name); if (!requested_dev_name) { error = -ENOMEM; goto out; } } rdev.name = requested_dev_name; tomoyo_fill_path_info(&rdev); r->param_type = TOMOYO_TYPE_MOUNT_ACL; r->param.mount.need_dev = need_dev; r->param.mount.dev = &rdev; r->param.mount.dir = &rdir; r->param.mount.type = &rtype; r->param.mount.flags = flags; do { tomoyo_check_acl(r, tomoyo_check_mount_acl); error = tomoyo_audit_mount_log(r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(requested_dev_name); kfree(requested_dir_name); if (fstype) put_filesystem(fstype); kfree(requested_type); /* Drop refcount obtained by kern_path(). */ if (obj.path1.dentry) path_put(&obj.path1); return error; } /** * tomoyo_mount_permission - Check permission for mount() operation. * * @dev_name: Name of device file. Maybe NULL. * @path: Pointer to "struct path". * @type: Name of filesystem type. Maybe NULL. * @flags: Mount options. * @data_page: Optional data. Maybe NULL. * * Returns 0 on success, negative value otherwise. */ int tomoyo_mount_permission(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data_page) { struct tomoyo_request_info r; int error; int idx; if (tomoyo_init_request_info(&r, NULL, TOMOYO_MAC_FILE_MOUNT) == TOMOYO_CONFIG_DISABLED) return 0; if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; if (flags & MS_REMOUNT) { type = tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]; flags &= ~MS_REMOUNT; } else if (flags & MS_BIND) { type = tomoyo_mounts[TOMOYO_MOUNT_BIND]; flags &= ~MS_BIND; } else if (flags & MS_SHARED) { if (flags & (MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]; flags &= ~MS_SHARED; } else if (flags & MS_PRIVATE) { if (flags & (MS_SHARED | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE]; flags &= ~MS_PRIVATE; } else if (flags & MS_SLAVE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE]; flags &= ~MS_SLAVE; } else if (flags & MS_UNBINDABLE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE]; flags &= ~MS_UNBINDABLE; } else if (flags & MS_MOVE) { type = tomoyo_mounts[TOMOYO_MOUNT_MOVE]; flags &= ~MS_MOVE; } if (!type) type = "<NULL>"; idx = tomoyo_read_lock(); error = tomoyo_mount_acl(&r, dev_name, path, type, flags); tomoyo_read_unlock(idx); return error; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 /* SPDX-License-Identifier: GPL-2.0 */ /* * NUMA memory policies for Linux. * Copyright 2003,2004 Andi Kleen SuSE Labs */ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 #include <linux/sched.h> #include <linux/mmzone.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <uapi/linux/mempolicy.h> struct mm_struct; #define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ #ifdef CONFIG_NUMA /* * Describe a memory policy. * * A mempolicy can be either associated with a process or with a VMA. * For VMA related allocations the VMA policy is preferred, otherwise * the process policy is used. Interrupts ignore the memory policy * of the current process. * * Locking policy for interleave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on * mmap_lock. * * Freeing policy: * Mempolicy objects are reference counted. A mempolicy will be freed when * mpol_put() decrements the reference count to zero. * * Duplicating policy objects: * mpol_dup() allocates a new mempolicy and copies the specified mempolicy * to the new storage. The reference count of the new object is initialized * to 1, representing the caller of mpol_dup(). */ struct mempolicy { atomic_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ } w; }; /* * Support for managing mempolicy data objects (clone, copy, destroy) * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. */ extern void __mpol_put(struct mempolicy *pol); static inline void mpol_put(struct mempolicy *pol) { if (pol) __mpol_put(pol); } /* * Does mempolicy pol need explicit unref after use? * Currently only needed for shared policies. */ static inline int mpol_needs_cond_ref(struct mempolicy *pol) { return (pol && (pol->flags & MPOL_F_SHARED)); } static inline void mpol_cond_put(struct mempolicy *pol) { if (mpol_needs_cond_ref(pol)) __mpol_put(pol); } extern struct mempolicy *__mpol_dup(struct mempolicy *pol); static inline struct mempolicy *mpol_dup(struct mempolicy *pol) { if (pol) pol = __mpol_dup(pol); return pol; } static inline void mpol_get(struct mempolicy *pol) { if (pol) atomic_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) return true; return __mpol_equal(a, b); } /* * Tree of shared policies for a shared memory region. */ struct shared_policy { struct rb_root root; rwlock_t lock; }; struct sp_node { struct rb_node nd; pgoff_t start, end; struct mempolicy *policy; }; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *mpol); void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; static inline void check_highest_zone(enum zone_type k) { if (k > policy_zone && k != ZONE_MOVABLE) policy_zone = k; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma); int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); } extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); #else struct mempolicy {}; static inline struct mempolicy *get_task_policy(struct task_struct *p) { return NULL; } static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } static inline void mpol_put(struct mempolicy *pol) { } static inline void mpol_cond_put(struct mempolicy *pol) { } static inline void mpol_get(struct mempolicy *pol) { } struct shared_policy {}; static inline void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { } static inline void mpol_free_shared_policy(struct shared_policy *sp) { } static inline struct mempolicy * mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { return NULL; } static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { *ilx = 0; return NULL; } static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { return 0; } static inline void numa_policy_init(void) { } static inline void numa_default_policy(void) { } static inline void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { } static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return 0; } static inline void check_highest_zone(int k) { } #ifdef CONFIG_TMPFS static inline int mpol_parse_str(char *str, struct mempolicy **mpol) { return 1; /* error */ } #endif static inline int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { return -1; /* no node preference */ } static inline void mpol_put_task_policy(struct task_struct *task) { } static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; } #endif /* CONFIG_NUMA */ #endif
365 23 6 19 16 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 // SPDX-License-Identifier: GPL-2.0 #include <net/ip.h> #include <net/udp.h> #include <net/udplite.h> #include <asm/checksum.h> #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum csum) { int carry; __u32 ulen; __u32 uproto; __u32 sum = (__force u32)csum; sum += (__force u32)saddr->s6_addr32[0]; carry = (sum < (__force u32)saddr->s6_addr32[0]); sum += carry; sum += (__force u32)saddr->s6_addr32[1]; carry = (sum < (__force u32)saddr->s6_addr32[1]); sum += carry; sum += (__force u32)saddr->s6_addr32[2]; carry = (sum < (__force u32)saddr->s6_addr32[2]); sum += carry; sum += (__force u32)saddr->s6_addr32[3]; carry = (sum < (__force u32)saddr->s6_addr32[3]); sum += carry; sum += (__force u32)daddr->s6_addr32[0]; carry = (sum < (__force u32)daddr->s6_addr32[0]); sum += carry; sum += (__force u32)daddr->s6_addr32[1]; carry = (sum < (__force u32)daddr->s6_addr32[1]); sum += carry; sum += (__force u32)daddr->s6_addr32[2]; carry = (sum < (__force u32)daddr->s6_addr32[2]); sum += carry; sum += (__force u32)daddr->s6_addr32[3]; carry = (sum < (__force u32)daddr->s6_addr32[3]); sum += carry; ulen = (__force u32)htonl((__u32) len); sum += ulen; carry = (sum < ulen); sum += carry; uproto = (__force u32)htonl(proto); sum += uproto; carry = (sum < uproto); sum += carry; return csum_fold((__force __wsum)sum); } EXPORT_SYMBOL(csum_ipv6_magic); #endif int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { int err; UDP_SKB_CB(skb)->partial_cov = 0; UDP_SKB_CB(skb)->cscov = skb->len; if (proto == IPPROTO_UDPLITE) { err = udplite_checksum_init(skb, uh); if (err) return err; if (UDP_SKB_CB(skb)->partial_cov) { skb->csum = ip6_compute_pseudo(skb, proto); return 0; } } /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) * we accept a checksum of zero here. When we find the socket * for the UDP packet we'll check if that socket allows zero checksum * for IPv6 (set by socket option). * * Note, we are only interested in != 0 or == 0, thus the * force to int. */ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, ip6_compute_pseudo); if (err) return err; if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { /* If SW calculated the value, we know it's bad */ if (skb->csum_complete_sw) return 1; /* HW says the value is bad. Let's validate that. * skb->csum is no longer the full packet checksum, * so don't treat is as such. */ skb_checksum_complete_unset(skb); } return 0; } EXPORT_SYMBOL(udp6_csum_init); /* Function to set UDP checksum for an IPv6 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. */ void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp6_set_csum);
3164 5312 5312 4718 3162 4623 5312 1813 3163 3163 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Sleepable Read-Copy Update mechanism for mutual exclusion * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * * Author: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU/ *.txt * */ #ifndef _LINUX_SRCU_H #define _LINUX_SRCU_H #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/rcu_segcblist.h> struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); #define init_srcu_struct(ssp) \ ({ \ static struct lock_class_key __srcu_key; \ \ __init_srcu_struct((ssp), #ssp, &__srcu_key); \ }) #define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *ssp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_TINY_SRCU #include <linux/srcutiny.h> #elif defined(CONFIG_TREE_SRCU) #include <linux/srcutree.h> #else #error "Unknown SRCU implementation specified to kernel configuration" #endif void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); void synchronize_srcu(struct srcu_struct *ssp); unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); #ifdef CONFIG_NEED_SRCU_NMI_SAFE int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); #else static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { return __srcu_read_lock(ssp); } static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { __srcu_read_unlock(ssp, idx); } #endif /* CONFIG_NEED_SRCU_NMI_SAFE */ void srcu_init(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * srcu_read_lock_held - might we be in SRCU read-side critical section? * @ssp: The srcu_struct structure to check * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. * * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. * * Note that SRCU is based on its own statemachine and it doesn't * relies on normal RCU, it can be called from the CPU which * is in the idle loop from an RCU point of view or offline. */ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { if (!debug_lockdep_rcu_enabled()) return 1; return lock_is_held(&ssp->dep_map); } /* * Annotations provide deadlock detection for SRCU. * * Similar to other lockdep annotations, except there is an additional * srcu_lock_sync(), which is basically an empty *write*-side critical section, * see lock_sync() for more information. */ /* Annotates a srcu_read_lock() */ static inline void srcu_lock_acquire(struct lockdep_map *map) { lock_map_acquire_read(map); } /* Annotates a srcu_read_lock() */ static inline void srcu_lock_release(struct lockdep_map *map) { lock_map_release(map); } /* Annotates a synchronize_srcu() */ static inline void srcu_lock_sync(struct lockdep_map *map) { lock_map_sync(map); } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { return 1; } #define srcu_lock_acquire(m) do { } while (0) #define srcu_lock_release(m) do { } while (0) #define srcu_lock_sync(m) do { } while (0) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #define SRCU_NMI_UNKNOWN 0x0 #define SRCU_NMI_UNSAFE 0x1 #define SRCU_NMI_SAFE 0x2 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe); #else static inline void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe) { } #endif /** * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * @c: condition to check for update-side use * * If PROVE_RCU is enabled, invoking this outside of an RCU read-side * critical section will result in an RCU-lockdep splat, unless @c evaluates * to 1. The @c argument will normally be a logical expression containing * lockdep_is_held() calls. */ #define srcu_dereference_check(p, ssp, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || srcu_read_lock_held(ssp), __rcu) /** * srcu_dereference - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU * is enabled, invoking this outside of an RCU read-side critical * section will result in an RCU-lockdep splat. */ #define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0) /** * srcu_dereference_notrace - no tracing and no lockdep calls from here * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. */ #define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1) /** * srcu_read_lock - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section. Note that SRCU read-side * critical sections may be nested. However, it is illegal to * call anything that waits on an SRCU grace period for the same * srcu_struct, whether directly or indirectly. Please note that * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * * Note that srcu_read_lock() and the matching srcu_read_unlock() must * occur in the same context, for example, it is illegal to invoke * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() * was invoked in process context. */ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; srcu_check_nmi_safety(ssp, false); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; } /** * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section, but in an NMI-safe manner. * See srcu_read_lock() for more information. */ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp) { int retval; srcu_check_nmi_safety(ssp, true); retval = __srcu_read_lock_nmisafe(ssp); rcu_lock_acquire(&ssp->dep_map); return retval; } /* Used by tracing, cannot be traced and cannot invoke lockdep. */ static inline notrace int srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; srcu_check_nmi_safety(ssp, false); retval = __srcu_read_lock(ssp); return retval; } /** * srcu_down_read - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter a semaphore-like SRCU read-side critical section. Note that * SRCU read-side critical sections may be nested. However, it is * illegal to call anything that waits on an SRCU grace period for the * same srcu_struct, whether directly or indirectly. Please note that * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). But if you want lockdep to help you * keep this stuff straight, you should instead use srcu_read_lock(). * * The semaphore-like nature of srcu_down_read() means that the matching * srcu_up_read() can be invoked from some other context, for example, * from some other task or from an irq handler. However, neither * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler. * * Calls to srcu_down_read() may be nested, similar to the manner in * which calls to down_read() may be nested. */ static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(in_nmi()); srcu_check_nmi_safety(ssp, false); return __srcu_read_lock(ssp); } /** * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock(). * * Exit an SRCU read-side critical section. */ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); srcu_check_nmi_safety(ssp, false); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock(). * * Exit an SRCU read-side critical section, but in an NMI-safe manner. */ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); srcu_check_nmi_safety(ssp, true); rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } /* Used by tracing, cannot be traced and cannot call lockdep. */ static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { srcu_check_nmi_safety(ssp, false); __srcu_read_unlock(ssp, idx); } /** * srcu_up_read - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock(). * * Exit an SRCU read-side critical section, but not necessarily from * the same context as the maching srcu_down_read(). */ static inline void srcu_up_read(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); WARN_ON_ONCE(in_nmi()); srcu_check_nmi_safety(ssp, false); __srcu_read_unlock(ssp, idx); } /** * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock * * Converts the preceding srcu_read_unlock into a two-way memory barrier. * * Call this after srcu_read_unlock, to guarantee that all memory operations * that occur after smp_mb__after_srcu_read_unlock will appear to happen after * the preceding srcu_read_unlock. */ static inline void smp_mb__after_srcu_read_unlock(void) { /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */ } DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct, _T->idx = srcu_read_lock(_T->lock), srcu_read_unlock(_T->lock, _T->idx), int idx) #endif
184 1688 2807 4287 2733 231 3 189 428 2612 249 240 204 2097 4213 2703 4109 99 67 77 1898 73 442 87 6 6 6 6 6 20 6 149 150 150 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/types.h> /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; } /** * xa_tag_pointer() - Create an XArray entry for a tagged pointer. * @p: Plain pointer. * @tag: Tag value (0, 1 or 3). * * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete an operation, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno. * @entry: Result from calling an XArray function. * * If an XArray operation cannot complete an operation, it will return * a special pointer value which encodes an errno. This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) #define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compiletime instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. * @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compiletime instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) /** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (eg you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. */ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. */ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; struct list_lru *xa_lru; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL, \ .xa_lru = NULL, \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on.` */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. */ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_advance() - Skip over sibling entries. * @xas: XArray operation state. * @index: Index of last sibling entry. * * Move the operation state to refer to the last sibling entry. * This is useful for loops that normally want to see sibling * entries but sometimes want to skip them. Use xas_set() if you * want to move to an index which is not part of this entry. */ static inline void xas_advance(struct xa_state *xas, unsigned long index) { unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0; xas->xa_index = index; xas->xa_offset = (index >> shift) & XA_CHUNK_MASK; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. */ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru) { xas->xa_lru = lru; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. */ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. */ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xa_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_next(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == XA_CHUNK_MASK)) return __xas_next(xas); xas->xa_index++; xas->xa_offset++; return xa_entry(xas->xa, node, xas->xa_offset); } #endif /* _LINUX_XARRAY_H */
2218 2218 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 /* SPDX-License-Identifier: GPL-2.0 */ /* * Filesystem access notification for Linux * * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #ifndef __LINUX_FSNOTIFY_BACKEND_H #define __LINUX_FSNOTIFY_BACKEND_H #ifdef __KERNEL__ #include <linux/idr.h> /* inotify uses this */ #include <linux/fs.h> /* struct inode */ #include <linux/list.h> #include <linux/path.h> /* struct path */ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/mempool.h> #include <linux/sched/mm.h> /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily * convert between them. dnotify only needs conversion at watch creation * so no perf loss there. fanotify isn't defined yet, so it can use the * wholes if it needs more events. */ #define FS_ACCESS 0x00000001 /* File was accessed */ #define FS_MODIFY 0x00000002 /* File was modified */ #define FS_ATTRIB 0x00000004 /* Metadata changed */ #define FS_CLOSE_WRITE 0x00000008 /* Writtable file was closed */ #define FS_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */ #define FS_OPEN 0x00000020 /* File was opened */ #define FS_MOVED_FROM 0x00000040 /* File was moved from X */ #define FS_MOVED_TO 0x00000080 /* File was moved to Y */ #define FS_CREATE 0x00000100 /* Subfile was created */ #define FS_DELETE 0x00000200 /* Subfile was deleted */ #define FS_DELETE_SELF 0x00000400 /* Self was deleted */ #define FS_MOVE_SELF 0x00000800 /* Self was moved */ #define FS_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ #define FS_ERROR 0x00008000 /* Filesystem Error (fanotify) */ /* * FS_IN_IGNORED overloads FS_ERROR. It is only used internally by inotify * which does not support FS_ERROR. */ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. * Set on inode/sb/mount marks that care about parent/name info. */ #define FS_EVENT_ON_CHILD 0x08000000 #define FS_RENAME 0x10000000 /* File was renamed */ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ #define FS_ISDIR 0x40000000 /* event occurred against dir */ #define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) /* * Directory entry modification events - reported only to directory * where entry is modified and not to a watching parent. * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event * when a directory entry inside a child subdir changes. */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) /* * This is a list of all events that may get sent to a parent that is watching * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory. */ #define FS_EVENTS_POSS_ON_CHILD (ALL_FSNOTIFY_PERM_EVENTS | \ FS_ACCESS | FS_MODIFY | FS_ATTRIB | \ FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \ FS_OPEN | FS_OPEN_EXEC) /* * This is a list of all events that may get sent with the parent inode as the * @to_tell argument of fsnotify(). * It may include events that can be sent to an inode/sb/mount mark, but cannot * be sent to a parent watching children. */ #define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD) /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ FS_ERROR) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT) #define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS) struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; struct mem_cgroup; /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * * handle_event - main call for a group to handle an fs event * @group: group to notify * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @cookie: inotify rename cookie * @iter_info: array of marks from this group that are interested in the event * * handle_inode_event - simple variant of handle_event() for groups that only * have inode marks and don't have ignore mask * @mark: mark to notify * @mask: event type and flags * @inode: inode that event happened on * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. * Either @inode or @dir must be non-NULL. * @file_name: optional file name associated with event * @cookie: inotify rename cookie * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group * MUST be holding a reference on each mark and that reference must be * dropped in this function. inotify uses this function to send * userspace messages that marks have been removed. */ struct fsnotify_ops { int (*handle_event)(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; /* * all of the information about the original object we want to now send to * a group. If you want to carry more info from the accessing task to the * listener this structure is where you need to be adding fields. */ struct fsnotify_event { struct list_head list; }; /* * A group is a "thing" that wants to receive notification about filesystem * events. The mask holds the subset of event types this group cares about. * refcnt on a group is up to the implementor and at any moment if it goes 0 * everything will be cleaned up. */ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. * As an example, the dnotify group will always have a refcnt=1 and that * will never change. Inotify, on the other hand, has a group per * inotify_init() and the refcnt will hit 0 only when that fd has been * closed. */ refcount_t refcnt; /* things with interest in this group */ /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ unsigned int max_events; /* maximum events allowed on the list */ /* * Valid fsnotify group priorities. Events are send in order from highest * priority to lowest priority. We default to the lowest priority. */ #define FS_PRIO_0 0 /* normal notifiers, no permissions */ #define FS_PRIO_1 1 /* fanotify content based access control */ #define FS_PRIO_2 2 /* fanotify pre-content access */ unsigned int priority; bool shutdown; /* group is being shut down, don't queue more events */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ #define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ #define FSNOTIFY_GROUP_NOFS 0x04 /* group lock is not direct reclaim safe */ int flags; unsigned int owner_flags; /* stored flags of mark_mutex owner */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ atomic_t user_waits; /* Number of tasks waiting for user * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ struct mem_cgroup *memcg; /* memcg to charge allocations */ /* groups can define private fields here or use the void *private */ union { void *private; #ifdef CONFIG_INOTIFY_USER struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY struct fanotify_group_private_data { /* Hash table of events for merge */ struct hlist_head *merge_hash; /* allows a group to block waiting for a userspace response */ struct list_head access_list; wait_queue_head_t access_waitq; int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; mempool_t error_events_pool; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; }; /* * These helpers are used to prevent deadlock when reclaiming inodes with * evictable marks of the same group that is allocating a new mark. */ static inline void fsnotify_group_lock(struct fsnotify_group *group) { mutex_lock(&group->mark_mutex); if (group->flags & FSNOTIFY_GROUP_NOFS) group->owner_flags = memalloc_nofs_save(); } static inline void fsnotify_group_unlock(struct fsnotify_group *group) { if (group->flags & FSNOTIFY_GROUP_NOFS) memalloc_nofs_restore(group->owner_flags); mutex_unlock(&group->mark_mutex); } static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) { WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); if (group->flags & FSNOTIFY_GROUP_NOFS) WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, FSNOTIFY_EVENT_ERROR, }; struct fs_error_report { int error; struct inode *inode; struct super_block *sb; }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; case FSNOTIFY_EVENT_DENTRY: return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: return NULL; } } static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_DENTRY: /* Non const is needed for dget() */ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; default: return NULL; } } static inline const struct path *fsnotify_data_path(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; default: return NULL; } } static inline struct super_block *fsnotify_data_sb(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return ((struct inode *)data)->i_sb; case FSNOTIFY_EVENT_DENTRY: return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: return NULL; } } static inline struct fs_error_report *fsnotify_data_error_report( const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_ERROR: return (struct fs_error_report *) data; default: return NULL; } } /* * Index to merged marks iterator array that correlates to a type of watch. * The type of watched object can be deduced from the iterator type, but not * the other way around, because an event can match different watched objects * of the same object type. * For example, both parent and child are watching an object of type inode. */ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_INODE, FSNOTIFY_ITER_TYPE_VFSMOUNT, FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, FSNOTIFY_ITER_TYPE_INODE2, FSNOTIFY_ITER_TYPE_COUNT }; /* The type of object that a mark is attached to */ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_ANY = -1, FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; static inline bool fsnotify_valid_obj_type(unsigned int obj_type) { return (obj_type < FSNOTIFY_OBJ_TYPE_COUNT); } struct fsnotify_iter_info { struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT]; struct fsnotify_group *current_group; unsigned int report_mask; int srcu_idx; }; static inline bool fsnotify_iter_should_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { return (iter_info->report_mask & (1U << iter_type)); } static inline void fsnotify_iter_set_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { iter_info->report_mask |= (1U << iter_type); } static inline struct fsnotify_mark *fsnotify_iter_mark( struct fsnotify_iter_info *iter_info, int iter_type) { if (fsnotify_iter_should_report_type(iter_info, iter_type)) return iter_info->marks[iter_type]; return NULL; } static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type, struct fsnotify_mark **markp) { while (type < FSNOTIFY_ITER_TYPE_COUNT) { *markp = fsnotify_iter_mark(iter, type); if (*markp) break; type++; } return type; } #define FSNOTIFY_ITER_FUNCS(name, NAME) \ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { \ return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \ } FSNOTIFY_ITER_FUNCS(inode, INODE) FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB) #define fsnotify_foreach_iter_type(type) \ for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++) #define fsnotify_foreach_iter_mark_type(iter, mark, type) \ for (type = 0; \ type = fsnotify_iter_step(iter, type, &mark), \ type < FSNOTIFY_ITER_TYPE_COUNT; \ type++) /* * fsnotify_connp_t is what we embed in objects which connector can be attached * to. fsnotify_connp_t * is how we refer from connector back to object. */ struct fsnotify_mark_connector; typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; /* * Inode/vfsmount/sb point to this structure which tracks all marks attached to * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this * structure. We destroy this structure when there are no more marks attached * to it. The structure is protected by fsnotify_mark_srcu. */ struct fsnotify_mark_connector { spinlock_t lock; unsigned short type; /* Type of object [lock] */ #define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ __kernel_fsid_t fsid; /* fsid of filesystem containing object */ union { /* Object pointer [lock] */ fsnotify_connp_t *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; struct hlist_head list; }; /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events * of a type matching mask or only interested in those events. * * These are flushed when an inode is evicted from core and may be flushed * when the inode is modified (as seen by fsnotify_access). Some fsnotify * users (such as dnotify) will flush these when the open fd is closed and not * at inode eviction or modification. * * Text in brackets is showing the lock(s) protecting modifications of a * particular entry. obj_lock means either inode->i_lock or * mnt->mnt_root->d_lock depending on the mark type. */ struct fsnotify_mark { /* Mask this mark is for [mark->lock, group->mark_mutex] */ __u32 mask; /* We hold one for presence in g_list. Also one ref for each 'thing' * in kernel that found and may be using this mark. */ refcount_t refcnt; /* Group this mark is for. Set on mark creation, stable until last ref * is dropped */ struct fsnotify_group *group; /* List of marks by group->marks_list. Also reused for queueing * mark into destroy_list when it's waiting for the end of SRCU period * before it can be freed. [group->mark_mutex] */ struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; /* Events types and flags to ignore [mark->lock, group->mark_mutex] */ __u32 ignore_mask; /* General fsnotify mark flags */ #define FSNOTIFY_MARK_FLAG_ALIVE 0x0001 #define FSNOTIFY_MARK_FLAG_ATTACHED 0x0002 /* inotify mark flags */ #define FSNOTIFY_MARK_FLAG_EXCL_UNLINK 0x0010 #define FSNOTIFY_MARK_FLAG_IN_ONESHOT 0x0020 /* fanotify mark flags */ #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 unsigned int flags; /* flags [mark->lock] */ }; #ifdef CONFIG_FSNOTIFY /* called from the vfs helpers */ /* main fsnotify call to send events */ extern int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie); extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type); extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); extern u32 fsnotify_get_cookie(void); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { /* FS_EVENT_ON_CHILD is set on marks that want parent/name info */ if (!(mask & FS_EVENT_ON_CHILD)) return 0; /* * This object might be watched by a mark that cares about parent/name * info, does it care about the specific set of events that can be * reported with parent/name info? */ return mask & FS_EVENTS_POSS_TO_PARENT; } static inline int fsnotify_inode_watches_children(struct inode *inode) { /* FS_EVENT_ON_CHILD is set if the inode may care */ if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD)) return 0; /* this inode might care about child events, does it care about the * specific set of events that can happen on a child? */ return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD; } /* * Update the dentry with a flag indicating the interest of its parent to receive * filesystem events when those events happens to this dentry->d_inode. */ static inline void fsnotify_update_flags(struct dentry *dentry) { assert_spin_locked(&dentry->d_lock); /* * Serialisation of setting PARENT_WATCHED on the dentries is provided * by d_lock. If inotify_inode_watched changes after we have taken * d_lock, the following __fsnotify_update_child_dentry_flags call will * find our entry, so it will spin until we complete here, and update * us with the new state. */ if (fsnotify_inode_watches_children(dentry->d_parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; } /* called from fsnotify listeners, such as fanotify or dnotify */ /* create a new group */ extern struct fsnotify_group *fsnotify_alloc_group( const struct fsnotify_ops *ops, int flags); /* get reference to a group */ extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); /* group destruction begins, stop queuing new events */ extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); /* Free event from memory */ extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ extern int fsnotify_insert_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *), void (*insert)(struct fsnotify_group *, struct fsnotify_event *)); static inline int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *)) { return fsnotify_insert_event(group, event, merge, NULL); } /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { fsnotify_add_event(group, group->overflow_event, NULL); } static inline bool fsnotify_is_overflow_event(u32 mask) { return mask & FS_Q_OVERFLOW; } static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); return list_empty(&group->notification_list); } extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); /* return AND dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); /* Remove event queued in the notification list */ extern void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event); /* functions used to manipulate the marks attached to inodes */ /* * Canonical "ignore mask" including event flags. * * Note the subtle semantic difference from the legacy ->ignored_mask. * ->ignored_mask traditionally only meant which events should be ignored, * while ->ignore_mask also includes flags regarding the type of objects on * which events should be ignored. */ static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark) { __u32 ignore_mask = mark->ignore_mask; /* The event flags in ignore mask take effect */ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) return ignore_mask; /* * Legacy behavior: * - Always ignore events on dir * - Ignore events on child if parent is watching children */ ignore_mask |= FS_ISDIR; ignore_mask &= ~FS_EVENT_ON_CHILD; ignore_mask |= mark->mask & FS_EVENT_ON_CHILD; return ignore_mask; } /* Legacy ignored_mask - only event types to ignore */ static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark) { return mark->ignore_mask & ALL_FSNOTIFY_EVENTS; } /* * Check if mask (or ignore mask) should be applied depending if victim is a * directory and whether it is reported to a watching parent. */ static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir, int iter_type) { /* Should mask be applied to a directory? */ if (is_dir && !(mask & FS_ISDIR)) return false; /* Should mask be applied to a child? */ if (iter_type == FSNOTIFY_ITER_TYPE_PARENT && !(mask & FS_EVENT_ON_CHILD)) return false; return true; } /* * Effective ignore mask taking into account if event victim is a * directory and whether it is reported to a watching parent. */ static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark, bool is_dir, int iter_type) { __u32 ignore_mask = fsnotify_ignored_events(mark); if (!ignore_mask) return 0; /* For non-dir and non-child, no need to consult the event flags */ if (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT) return ignore_mask; ignore_mask = fsnotify_ignore_mask(mark); if (!fsnotify_mask_applicable(ignore_mask, is_dir, iter_type)) return 0; return ignore_mask & ALL_FSNOTIFY_EVENTS; } /* Get mask for calculating object interest taking ignore mask into account */ static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark) { __u32 mask = mark->mask; if (!fsnotify_ignored_events(mark)) return mask; /* Interest in FS_MODIFY may be needed for clearing ignore mask */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mask |= FS_MODIFY; /* * If mark is interested in ignoring events on children, the object must * show interest in those events for fsnotify_parent() to notice it. */ return mask | mark->ignore_mask; } /* Get mask of events for a list of marks */ extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn); /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, struct fsnotify_group *group); /* attach the mark to the object */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, int add_flags, __kernel_fsid_t *fsid); extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, int add_flags, __kernel_fsid_t *fsid); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); } /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* detach mark from inode / mount list, group list, drop inode reference */ extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); /* Clear all of the marks of a group attached to a given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type); /* run all the marks in a group, and clear all of the vfsmount marks */ static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); } /* run all the marks in a group, and clear all of the inode marks */ static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE); } /* run all the marks in a group, and clear all of the sn marks */ static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB); } extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } #else static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) { return 0; } static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { return 0; } static inline void __fsnotify_inode_delete(struct inode *inode) {} static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) {} static inline void fsnotify_sb_delete(struct super_block *sb) {} static inline void fsnotify_update_flags(struct dentry *dentry) {} static inline u32 fsnotify_get_cookie(void) { return 0; } static inline void fsnotify_unmount_inodes(struct super_block *sb) {} #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ #endif /* __LINUX_FSNOTIFY_BACKEND_H */
23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. * Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #ifndef _LRU_LIST_H #define _LRU_LIST_H #include <linux/list.h> #include <linux/nodemask.h> #include <linux/shrinker.h> #include <linux/xarray.h> struct mem_cgroup; /* list_lru_walk_cb has to always return one of those */ enum lru_status { LRU_REMOVED, /* item removed from list */ LRU_REMOVED_RETRY, /* item removed, but lock has been dropped and reacquired */ LRU_ROTATE, /* item referenced, give another pass */ LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. */ }; struct list_lru_one { struct list_head list; /* may become negative during memcg reparenting */ long nr_items; }; struct list_lru_memcg { struct rcu_head rcu; /* array of per cgroup per node lists, indexed by node id */ struct list_lru_one node[]; }; struct list_lru_node { /* protects all lists on the node, including per cgroup */ spinlock_t lock; /* global list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; long nr_items; } ____cacheline_aligned_in_smp; struct list_lru { struct list_lru_node *node; #ifdef CONFIG_MEMCG_KMEM struct list_head list; int shrinker_id; bool memcg_aware; struct xarray xa; #endif }; void list_lru_destroy(struct list_lru *lru); int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker); #define list_lru_init(lru) \ __list_lru_init((lru), false, NULL, NULL) #define list_lru_init_key(lru, key) \ __list_lru_init((lru), false, (key), NULL) #define list_lru_init_memcg(lru, shrinker) \ __list_lru_init((lru), true, NULL, shrinker) int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp); void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent); /** * list_lru_add: add an element to the lru list's tail * @list_lru: the lru pointer * @item: the item to be added. * * If the element is already part of a list, this function returns doing * nothing. Therefore the caller does not need to keep state about whether or * not the element already belongs in the list and is allowed to lazy update * it. Note however that this is valid for *a* list, not *this* list. If * the caller organize itself in a way that elements can be in more than * one type of list, it is up to the caller to fully remove the item from * the previous list (with list_lru_del() for instance) before moving it * to @list_lru * * Return value: true if the list was updated, false otherwise */ bool list_lru_add(struct list_lru *lru, struct list_head *item); /** * list_lru_del: delete an element to the lru list * @list_lru: the lru pointer * @item: the item to be deleted. * * This function works analogously as list_lru_add in terms of list * manipulation. The comments about an element already pertaining to * a list are also valid for list_lru_del. * * Return value: true if the list was updated, false otherwise */ bool list_lru_del(struct list_lru *lru, struct list_head *item); /** * list_lru_count_one: return the number of objects currently held by @lru * @lru: the lru pointer. * @nid: the node id to count from. * @memcg: the cgroup to count from. * * Always return a non-negative number, 0 for empty lists. There is no * guarantee that the list is not updated while the count is being computed. * Callers that want such a guarantee need to provide an outer lock. */ unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg); unsigned long list_lru_count_node(struct list_lru *lru, int nid); static inline unsigned long list_lru_shrink_count(struct list_lru *lru, struct shrink_control *sc) { return list_lru_count_one(lru, sc->nid, sc->memcg); } static inline unsigned long list_lru_count(struct list_lru *lru) { long count = 0; int nid; for_each_node_state(nid, N_NORMAL_MEMORY) count += list_lru_count_node(lru, nid); return count; } void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); /** * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. * @isolate: callback function that is responsible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * * This function will scan all elements in a particular list_lru, calling the * @isolate callback for each of those items, along with the current list * spinlock and a caller-provided opaque. The @isolate callback can choose to * drop the lock internally, but *must* return with the lock held. The callback * will return an enum lru_status telling the list_lru infrastructure what to * do with the object being scanned. * * Please note that nr_to_walk does not mean how many objects will be freed, * just how many objects will be scanned. * * Return value: the number of objects effectively removed from the LRU. */ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); /** * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. * @isolate: callback function that is responsible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * * Same as @list_lru_walk_one except that the spinlock is acquired with * spin_lock_irq(). */ unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); static inline unsigned long list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, list_lru_walk_cb isolate, void *cb_arg) { return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg, &sc->nr_to_scan); } static inline unsigned long list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc, list_lru_walk_cb isolate, void *cb_arg) { return list_lru_walk_one_irq(lru, sc->nid, sc->memcg, isolate, cb_arg, &sc->nr_to_scan); } static inline unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, void *cb_arg, unsigned long nr_to_walk) { long isolated = 0; int nid; for_each_node_state(nid, N_NORMAL_MEMORY) { isolated += list_lru_walk_node(lru, nid, isolate, cb_arg, &nr_to_walk); if (nr_to_walk <= 0) break; } return isolated; } #endif /* _LRU_LIST_H */
1 110 115 4 110 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tcp #if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TCP_H #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/tracepoint.h> #include <net/ipv6.h> #include <net/tcp.h> #include <linux/sock_diag.h> #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ do { \ struct in6_addr *pin6; \ \ pin6 = (struct in6_addr *)__entry->saddr_v6; \ ipv6_addr_set_v4mapped(saddr, pin6); \ pin6 = (struct in6_addr *)__entry->daddr_v6; \ ipv6_addr_set_v4mapped(daddr, pin6); \ } while (0) #if IS_ENABLED(CONFIG_IPV6) #define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ do { \ if (sk->sk_family == AF_INET6) { \ struct in6_addr *pin6; \ \ pin6 = (struct in6_addr *)__entry->saddr_v6; \ *pin6 = saddr6; \ pin6 = (struct in6_addr *)__entry->daddr_v6; \ *pin6 = daddr6; \ } else { \ TP_STORE_V4MAPPED(__entry, saddr, daddr); \ } \ } while (0) #else #define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ TP_STORE_V4MAPPED(__entry, saddr, daddr) #endif /* * tcp event with arguments sk and skb * * Note: this class requires a valid sk pointer; while skb pointer could * be NULL. */ DECLARE_EVENT_CLASS(tcp_event_sk_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->state)) ); DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); /* * skb of trace_tcp_send_reset is the skb that caused RST. In case of * active reset, skb should be NULL */ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); /* * tcp event with arguments sk * * Note: this class requires a valid sk pointer. */ DECLARE_EVENT_CLASS(tcp_event_sk, TP_PROTO(struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u64, sock_cookie) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->sock_cookie) ); DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), TP_ARGS(sk, req), TP_STRUCT__entry( __field(const void *, skaddr) __field(const void *, req) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( struct inet_request_sock *ireq = inet_rsk(req); __be32 *p32; __entry->skaddr = sk; __entry->req = req; __entry->sport = ireq->ir_num; __entry->dport = ntohs(ireq->ir_rmt_port); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = ireq->ir_loc_addr; p32 = (__be32 *) __entry->daddr; *p32 = ireq->ir_rmt_addr; TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) ); #include <trace/events/net_probe_common.h> TRACE_EVENT(tcp_probe, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u32, mark) __field(__u16, data_len) __field(__u32, snd_nxt) __field(__u32, snd_una) __field(__u32, snd_cwnd) __field(__u32, ssthresh) __field(__u32, snd_wnd) __field(__u32, srtt) __field(__u32, rcv_wnd) __field(__u64, sock_cookie) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; const struct inet_sock *inet = inet_sk(sk); const struct tcp_sock *tp = tcp_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; __entry->family = sk->sk_family; __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tcp_snd_cwnd(tp); __entry->snd_wnd = tp->snd_wnd; __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("family=%s src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->mark, __entry->data_len, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ); #define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) \ do { \ const struct tcphdr *th = (const struct tcphdr *)skb->data; \ struct sockaddr_in *v4 = (void *)__entry->saddr; \ \ v4->sin_family = AF_INET; \ v4->sin_port = th->source; \ v4->sin_addr.s_addr = ip_hdr(skb)->saddr; \ v4 = (void *)__entry->daddr; \ v4->sin_family = AF_INET; \ v4->sin_port = th->dest; \ v4->sin_addr.s_addr = ip_hdr(skb)->daddr; \ } while (0) #if IS_ENABLED(CONFIG_IPV6) #define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ do { \ const struct iphdr *iph = ip_hdr(skb); \ \ if (iph->version == 6) { \ const struct tcphdr *th = (const struct tcphdr *)skb->data; \ struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ \ v6->sin6_family = AF_INET6; \ v6->sin6_port = th->source; \ v6->sin6_addr = ipv6_hdr(skb)->saddr; \ v6 = (void *)__entry->daddr; \ v6->sin6_family = AF_INET6; \ v6->sin6_port = th->dest; \ v6->sin6_addr = ipv6_hdr(skb)->daddr; \ } else \ TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb); \ } while (0) #else #define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) #endif /* * tcp event with only skb */ DECLARE_EVENT_CLASS(tcp_event_skb, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field(const void *, skbaddr) __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->skbaddr = skb; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(__entry, skb); ), TP_printk("src=%pISpc dest=%pISpc", __entry->saddr, __entry->daddr) ); DEFINE_EVENT(tcp_event_skb, tcp_bad_csum, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); TRACE_EVENT(tcp_cong_state_set, TP_PROTO(struct sock *sk, const u8 ca_state), TP_ARGS(sk, ca_state), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u8, cong_state) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->cong_state = ca_state; ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->cong_state) ); #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1560 1558 1558 1558 1560 492 492 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368 13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429 13430 13431 13432 13433 13434 13435 13436 13437 13438 13439 13440 13441 13442 13443 13444 13445 13446 13447 13448 13449 13450 13451 13452 13453 13454 13455 13456 13457 13458 13459 13460 13461 13462 13463 13464 13465 13466 13467 13468 13469 13470 13471 13472 13473 13474 13475 13476 13477 13478 13479 13480 13481 13482 13483 13484 13485 13486 13487 13488 13489 13490 13491 13492 13493 13494 13495 13496 13497 13498 13499 13500 13501 13502 13503 13504 13505 13506 13507 13508 13509 13510 13511 13512 13513 13514 13515 13516 13517 13518 13519 13520 13521 13522 13523 13524 13525 13526 13527 13528 13529 13530 13531 13532 13533 13534 13535 13536 13537 13538 13539 13540 13541 13542 13543 13544 13545 13546 13547 13548 13549 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559 13560 13561 13562 13563 13564 13565 13566 13567 13568 13569 13570 13571 13572 13573 13574 13575 13576 13577 13578 13579 13580 13581 13582 13583 13584 13585 13586 13587 13588 13589 13590 13591 13592 13593 13594 13595 13596 13597 13598 13599 13600 13601 13602 13603 13604 13605 13606 13607 13608 13609 13610 13611 13612 13613 13614 13615 13616 13617 13618 13619 13620 13621 13622 13623 13624 13625 13626 13627 13628 13629 13630 13631 13632 13633 13634 13635 13636 13637 13638 13639 13640 13641 13642 13643 13644 13645 13646 13647 13648 13649 13650 13651 13652 13653 13654 13655 13656 13657 13658 13659 13660 13661 13662 13663 13664 13665 13666 13667 13668 13669 13670 13671 13672 13673 13674 13675 13676 13677 13678 13679 13680 13681 13682 13683 13684 13685 13686 13687 13688 13689 13690 13691 13692 13693 13694 13695 13696 13697 13698 13699 13700 13701 13702 13703 13704 13705 13706 13707 13708 13709 13710 13711 13712 13713 13714 13715 13716 13717 13718 13719 13720 13721 13722 13723 13724 13725 13726 13727 13728 13729 13730 13731 13732 13733 13734 13735 13736 13737 13738 13739 13740 13741 13742 13743 13744 13745 13746 13747 13748 13749 13750 13751 13752 13753 13754 13755 13756 13757 13758 13759 13760 13761 13762 13763 13764 13765 13766 13767 13768 13769 13770 13771 13772 13773 13774 13775 13776 13777 13778 13779 13780 13781 13782 13783 13784 13785 13786 13787 13788 13789 13790 13791 13792 13793 13794 13795 13796 13797 13798 13799 13800 13801 13802 13803 13804 13805 13806 13807 13808 13809 13810 13811 13812 13813 13814 13815 13816 13817 13818 13819 13820 13821 13822 13823 13824 13825 13826 13827 // SPDX-License-Identifier: GPL-2.0 /* * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/fs.h> #include <linux/mm.h> #include <linux/cpu.h> #include <linux/smp.h> #include <linux/idr.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/tick.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> #include <linux/ptrace.h> #include <linux/reboot.h> #include <linux/vmstat.h> #include <linux/device.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/hugetlb.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> #include <linux/cgroup.h> #include <linux/perf_event.h> #include <linux/trace_events.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> #include <linux/module.h> #include <linux/mman.h> #include <linux/compat.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/namei.h> #include <linux/parser.h> #include <linux/sched/clock.h> #include <linux/sched/mm.h> #include <linux/proc_ns.h> #include <linux/mount.h> #include <linux/min_heap.h> #include <linux/highmem.h> #include <linux/pgtable.h> #include <linux/buildid.h> #include <linux/task_work.h> #include "internal.h" #include <asm/irq_regs.h> typedef int (*remote_function_f)(void *); struct remote_function_call { struct task_struct *p; remote_function_f func; void *info; int ret; }; static void remote_function(void *data) { struct remote_function_call *tfc = data; struct task_struct *p = tfc->p; if (p) { /* -EAGAIN */ if (task_cpu(p) != smp_processor_id()) return; /* * Now that we're on right CPU with IRQs disabled, we can test * if we hit the right task without races. */ tfc->ret = -ESRCH; /* No such (running) process */ if (p != current) return; } tfc->ret = tfc->func(tfc->info); } /** * task_function_call - call a function on the cpu on which a task runs * @p: the task to evaluate * @func: the function to be called * @info: the function call argument * * Calls the function @func when the task is currently running. This might * be on the current CPU, which just calls the function directly. This will * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) { struct remote_function_call data = { .p = p, .func = func, .info = info, .ret = -EAGAIN, }; int ret; for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); if (!ret) ret = data.ret; if (ret != -EAGAIN) break; cond_resched(); } return ret; } /** * cpu_function_call - call a function on the cpu * @cpu: target cpu to queue this function * @func: the function to be called * @info: the function call argument * * Calls the function @func on the remote cpu. * * returns: @func return value or -ENXIO when the cpu is offline */ static int cpu_function_call(int cpu, remote_function_f func, void *info) { struct remote_function_call data = { .p = NULL, .func = func, .info = info, .ret = -ENXIO, /* No such CPU */ }; smp_call_function_single(cpu, remote_function, &data, 1); return data.ret; } static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { raw_spin_lock(&cpuctx->ctx.lock); if (ctx) raw_spin_lock(&ctx->lock); } static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) raw_spin_unlock(&ctx->lock); raw_spin_unlock(&cpuctx->ctx.lock); } #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { return READ_ONCE(event->owner) == TASK_TOMBSTONE; } static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); struct perf_event_context *perf_cpu_task_ctx(void) { lockdep_assert_irqs_disabled(); return this_cpu_ptr(&perf_cpu_context)->task_ctx; } /* * On task ctx scheduling... * * When !ctx->nr_events a task context will not be scheduled. This means * we can disable the scheduler hooks (for performance) without leaving * pending task ctx state. * * This however results in two special cases: * * - removing the last event from a task ctx; this is relatively straight * forward and is done in __perf_remove_from_context. * * - adding the first event to a task ctx; this is tricky because we cannot * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, struct perf_event_context *, void *); struct event_function_struct { struct perf_event *event; event_f func; void *data; }; static int event_function(void *info) { struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* * Since we do the IPI call without holding ctx->lock things can have * changed, double check we hit the task we set out to hit. */ if (ctx->task) { if (ctx->task != current) { ret = -ESRCH; goto unlock; } /* * We only use event_function_call() on established contexts, * and event_function() is only ever called when active (or * rather, we'll have bailed in task_function_call() or the * above ctx->task != current test), therefore we must have * ctx->is_active here. */ WARN_ON_ONCE(!ctx->is_active); /* * And since we have ctx->is_active, cpuctx->task_ctx must * match. */ WARN_ON_ONCE(task_ctx != ctx); } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } efs->func(event, cpuctx, ctx, efs->data); unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ struct event_function_struct efs = { .event = event, .func = func, .data = data, }; if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); } if (!task) { cpu_function_call(event->cpu, event_function, &efs); return; } if (task == TASK_TOMBSTONE) return; again: if (!task_function_call(task, event_function, &efs)) return; raw_spin_lock_irq(&ctx->lock); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; if (task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto again; } func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } /* * Similar to event_function_call() + event_function(), but hard assumes IRQs * are already disabled and we're on the right CPU. */ static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) return; task_ctx = ctx; } perf_ctx_lock(cpuctx, task_ctx); task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (task) { /* * We must be either inactive or active and the right task, * otherwise we're screwed, since we cannot IPI to somewhere * else. */ if (ctx->is_active) { if (WARN_ON_ONCE(task != current)) goto unlock; if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) goto unlock; } } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } func(event, cpuctx, ctx, data); unlock: perf_ctx_unlock(cpuctx, task_ctx); } #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ PERF_FLAG_FD_CLOEXEC) /* * branch priv levels that need permission checks */ #define PERF_SAMPLE_BRANCH_PERM_PLM \ (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_TIME = 0x4, /* see ctx_resched() for details */ EVENT_CPU = 0x8, EVENT_CGROUP = 0x10, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; /* * perf_sched_events : >0 events exist */ static void perf_sched_delayed(struct work_struct *work); DEFINE_STATIC_KEY_FALSE(perf_sched_events); static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; static struct kmem_cache *perf_event_cache; /* * perf event paranoia level: * -1 - not paranoid at all * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ int sysctl_perf_event_paranoid __read_mostly = 2; /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ /* * max perf event sample rate */ #define DEFAULT_MAX_SAMPLE_RATE 100000 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; tmp = div_u64(tmp, 100); if (!tmp) tmp = 1; WRITE_ONCE(perf_sample_allowed_ns, tmp); } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); int perf_event_max_sample_rate_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; /* * If throttling is disabled don't allow the write: */ if (write && (perf_cpu == 100 || perf_cpu == 0)) return -EINVAL; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); return 0; } int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; if (sysctl_perf_cpu_time_max_percent == 100 || sysctl_perf_cpu_time_max_percent == 0) { printk(KERN_WARNING "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); WRITE_ONCE(perf_sample_allowed_ns, 0); } else { update_perf_cpu_limits(); } return 0; } /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not * get any real work done. This will drop the sample rate when * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); static u64 __report_avg; static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { u64 max_len = READ_ONCE(perf_sample_allowed_ns); u64 running_len; u64 avg_len; u32 max; if (max_len == 0) return; /* Decay the counter by 1 average sample. */ running_len = __this_cpu_read(running_sample_length); running_len -= running_len/NR_ACCUMULATED_SAMPLES; running_len += sample_len_ns; __this_cpu_write(running_sample_length, running_len); /* * Note: this will be biased artifically low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ avg_len = running_len/NR_ACCUMULATED_SAMPLES; if (avg_len <= max_len) return; __report_avg = avg_len; __report_allowed = max_len; /* * Compute a throttle threshold 25% below the current duration. */ avg_len += avg_len / 4; max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; if (avg_len < max) max /= (u32)avg_len; else max = 1; WRITE_ONCE(perf_sample_allowed_ns, avg_len); WRITE_ONCE(max_samples_per_tick, max); sysctl_perf_event_sample_rate = max * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } static atomic64_t perf_event_id; static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } static inline u64 perf_clock(void) { return local_clock(); } static inline u64 perf_event_clock(struct perf_event *event) { return event->clock(); } /* * State based event timekeeping... * * The basic idea is to use event->state to determine which (if any) time * fields to increment with the current delta. This means we only need to * update timestamps when we change state or when they are explicitly requested * (read). * * Event groups make things a little more complicated, but not terribly so. The * rules for a group are that if the group leader is OFF the entire group is * OFF, irrespecive of what the group member states are. This results in * __perf_effective_state(). * * A futher ramification is that when a group leader flips between OFF and * !OFF, we need to update all group member times. * * * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we * need to make sure the relevant context time is updated before we try and * update our timestamps. */ static __always_inline enum perf_event_state __perf_effective_state(struct perf_event *event) { struct perf_event *leader = event->group_leader; if (leader->state <= PERF_EVENT_STATE_OFF) return leader->state; return event->state; } static __always_inline void __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) { enum perf_event_state state = __perf_effective_state(event); u64 delta = now - event->tstamp; *enabled = event->total_time_enabled; if (state >= PERF_EVENT_STATE_INACTIVE) *enabled += delta; *running = event->total_time_running; if (state >= PERF_EVENT_STATE_ACTIVE) *running += delta; } static void perf_event_update_time(struct perf_event *event) { u64 now = perf_event_time(event); __perf_update_times(event, now, &event->total_time_enabled, &event->total_time_running); event->tstamp = now; } static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } static void perf_event_set_state(struct perf_event *event, enum perf_event_state state) { if (event->state == state) return; perf_event_update_time(event); /* * If a group leader gets enabled/disabled all its siblings * are affected too. */ if ((event->state < 0) ^ (state < 0)) perf_event_update_sibling_time(event); WRITE_ONCE(event->state, state); } /* * UP store-release, load-acquire */ #define __store_release(ptr, val) \ do { \ barrier(); \ WRITE_ONCE(*(ptr), (val)); \ } while (0) #define __load_acquire(ptr) \ ({ \ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ barrier(); \ ___p; \ }) static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (cgroup && !pmu_ctx->nr_cgroups) continue; perf_pmu_disable(pmu_ctx->pmu); } } static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (cgroup && !pmu_ctx->nr_cgroups) continue; perf_pmu_enable(pmu_ctx->pmu); } } static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) return true; /* wants specific cgroup scope but @cpuctx isn't associated with any */ if (!cpuctx->cgrp) return false; /* * Cgroup scoping is recursive. An event enabled for a cgroup is * also enabled for all its descendant cgroups. If @cpuctx's * cgroup is a descendant of @event's (the test covers identity * case), it's a match. */ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, event->cgrp->css.cgroup); } static inline void perf_detach_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); event->cgrp = NULL; } static inline int is_cgroup_event(struct perf_event *event) { return event->cgrp != NULL; } static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); return t->time; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) return t->time; now += READ_ONCE(t->timeoffset); return now; } static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) { if (adv) info->time += now - info->timestamp; info->timestamp = now; /* * see update_context_time() */ WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; struct perf_cgroup_info *info; if (cgrp) { u64 now = perf_clock(); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, now, true); if (final) __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; /* * ensure we access cgroup data only when needed and * when we know the cgroup is pinned (css_get) */ if (!is_cgroup_event(event)) return; info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ if (info->active) __update_cgrp_time(info, perf_clock(), true); } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; /* * ctx->lock held by caller * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ if (!cgrp) return; WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, ctx->timestamp, false); __store_release(&info->active, 1); } } /* * reschedule events based on the cgroup constraint of task. */ static void perf_cgroup_switch(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; /* * cpuctx->cgrp is set when the first cgroup event enabled, * and is cleared when the last cgroup event disabled. */ if (READ_ONCE(cpuctx->cgrp) == NULL) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in * ctx_sched_out() */ cpuctx->cgrp = cgrp; /* * set cgrp before ctxsw in to allow * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { struct perf_cpu_context *cpuctx; struct perf_event **storage; int cpu, heap_size, ret = 0; /* * Allow storage to have sufficent space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup. */ for (heap_size = 1; css; css = css->parent) heap_size++; for_each_possible_cpu(cpu) { cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; storage = kmalloc_node(heap_size * sizeof(struct perf_event *), GFP_KERNEL, cpu_to_node(cpu)); if (!storage) { ret = -ENOMEM; break; } raw_spin_lock_irq(&cpuctx->ctx.lock); if (cpuctx->heap_size < heap_size) { swap(cpuctx->heap, storage); if (storage == cpuctx->heap_default) storage = NULL; cpuctx->heap_size = heap_size; } raw_spin_unlock_irq(&cpuctx->ctx.lock); kfree(storage); } return ret; } static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; struct fd f = fdget(fd); int ret = 0; if (!f.file) return -EBADF; css = css_tryget_online_from_dir(f.file->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; } ret = perf_cgroup_ensure_storage(event, css); if (ret) goto out; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; /* * all events in a group must monitor * the same cgroup because a task belongs * to only one perf cgroup at a time */ if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; } out: fdput(f); return ret; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups++; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (ctx->nr_cgroups++) return; cpuctx->cgrp = perf_cgroup_from_task(current, ctx); } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups--; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (--ctx->nr_cgroups) return; cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ static inline bool perf_cgroup_match(struct perf_event *event) { return true; } static inline void perf_detach_cgroup(struct perf_event *event) {} static inline int is_cgroup_event(struct perf_event *event) { return 0; } static inline void update_cgrp_time_from_event(struct perf_event *event) { } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { } static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { return -EINVAL; } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } static inline u64 perf_cgroup_event_time(struct perf_event *event) { return 0; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } static void perf_cgroup_switch(struct task_struct *task) { } #endif /* * set default to be dependent on timer tick just * like original code */ #define PERF_CPU_HRTIMER (1000 / HZ) /* * function must be called with interrupts disabled */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); rotations = perf_rotate_context(cpc); raw_spin_lock(&cpc->hrtimer_lock); if (rotations) hrtimer_forward_now(hr, cpc->hrtimer_interval); else cpc->hrtimer_active = 0; raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART; } static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { struct hrtimer *timer = &cpc->hrtimer; struct pmu *pmu = cpc->epc.pmu; u64 interval; /* * check default is sane, if not set then force to * default interval (1/tick) */ interval = pmu->hrtimer_interval_ms; if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); if (!cpc->hrtimer_active) { cpc->hrtimer_active = 1; hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } static int perf_mux_hrtimer_restart_ipi(void *arg) { return perf_mux_hrtimer_restart(arg); } void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); } static void *alloc_task_ctx_data(struct pmu *pmu) { if (pmu->task_ctx_cache) return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); return NULL; } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { if (pmu->task_ctx_cache && task_ctx_data) kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); } static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); kfree(ctx); } static void put_ctx(struct perf_event_context *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } } /* * Because of perf_event::ctx migration in sys_perf_event_open::move_group and * perf_pmu_migrate_context() we need some magic. * * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * * Lock ordering is by mutex address. There are two other sites where * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] * perf_event_exit_event() * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() * inherit_group() * inherit_event() * perf_event_alloc() * perf_init_event() * perf_try_init_event() [ child , 1 ] * * While it appears there is an obvious deadlock here -- the parent and child * nesting levels are inverted between the two. This is in fact safe because * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break * the ctx parent<->child relation, and perf_pmu_migrate_context() is only * concerned with cpuctx and that doesn't have children. * * The places that change perf_event::ctx will issue: * * perf_remove_from_context(); * synchronize_rcu(); * perf_install_in_context(); * * to affect the change. The remove_from_context() + synchronize_rcu() should * quiesce the event, after which we can install it in the new location. This * means that only external vectors (perf_fops, prctl) can perturb the event * while in transit. Therefore all such accessors should also acquire * perf_event_context::mutex to serialize against this. * * However; because event->ctx can change while we're waiting to acquire * ctx->mutex we must be careful and use the below perf_event_ctx_lock() * function. * * Lock order: * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock * perf_event::mmap_mutex * mmap_lock * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) { struct perf_event_context *ctx; again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); mutex_lock_nested(&ctx->mutex, nesting); if (event->ctx != ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } return ctx; } static inline struct perf_event_context * perf_event_ctx_lock(struct perf_event *event) { return perf_event_ctx_lock_nested(event, 0); } static void perf_event_ctx_unlock(struct perf_event *event, struct perf_event_context *ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); } /* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up * calling scheduler related locks and ctx->lock nests inside those. */ static __must_check struct perf_event_context * unclone_ctx(struct perf_event_context *ctx) { struct perf_event_context *parent_ctx = ctx->parent_ctx; lockdep_assert_held(&ctx->lock); if (parent_ctx) ctx->parent_ctx = NULL; ctx->generation++; return parent_ctx; } static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, enum pid_type type) { u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; nr = __task_pid_nr_ns(p, type, event->ns); /* avoid -1 if it is idle thread or runs in another ns */ if (!nr && !pid_alive(p)) nr = -1; return nr; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_PID); } /* * If we inherit events we want to return the parent event id * to userspace. */ static u64 primary_event_id(struct perf_event *event) { u64 id = event->id; if (event->parent) id = event->parent->id; return id; } /* * Get the perf_event_context for a task and lock it. * * This has to cope with the fact that until it is locked, * the context could get moved to another task. */ static struct perf_event_context * perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read * side critical section has interrupts disabled. */ local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry * if so. If we locked the right context, then it * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); goto retry; } if (ctx->task == TASK_TOMBSTONE || !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); if (!ctx) local_irq_restore(*flags); return ctx; } /* * Get the context for a task and increment its pin_count so it * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ static struct perf_event_context * perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } /* * Update the record of the current time in a context. */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); lockdep_assert_held(&ctx->lock); if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; /* * The above: time' = time + (now - timestamp), can be re-arranged * into: time` = now + (time - timestamp), which gives a single value * offset to compute future time without locks on. * * See perf_event_time_now(), which can be used from NMI context where * it's (obviously) not possible to acquire ctx->lock in order to read * both the above values in a consistent manner. */ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time(event); return ctx->time; } static u64 perf_event_time_now(struct perf_event *event, u64 now) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) return ctx->time; now += READ_ONCE(ctx->timeoffset); return now; } static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; enum event_type_t event_type; lockdep_assert_held(&ctx->lock); /* * It's 'group type', really, because if our group leader is * pinned, so are we. */ if (event->group_leader != event) event = event->group_leader; event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; return event_type; } /* * Helper function to initialize event group nodes. */ static void init_event_group(struct perf_event *event) { RB_CLEAR_NODE(&event->group_node); event->group_index = 0; } /* * Extract pinned or flexible groups from the context * based on event attrs bits. */ static struct perf_event_groups * get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; else return &ctx->flexible_groups; } /* * Helper function to initializes perf_event_group trees. */ static void perf_event_groups_init(struct perf_event_groups *groups) { groups->tree = RB_ROOT; groups->index = 0; } static inline struct cgroup *event_cgroup(const struct perf_event *event) { struct cgroup *cgroup = NULL; #ifdef CONFIG_CGROUP_PERF if (event->cgrp) cgroup = event->cgrp->css.cgroup; #endif return cgroup; } /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ static __always_inline int perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, const struct cgroup *left_cgroup, const u64 left_group_index, const struct perf_event *right) { if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1; if (left_pmu) { if (left_pmu < right->pmu_ctx->pmu) return -1; if (left_pmu > right->pmu_ctx->pmu) return 1; } #ifdef CONFIG_CGROUP_PERF { const struct cgroup *right_cgroup = event_cgroup(right); if (left_cgroup != right_cgroup) { if (!left_cgroup) { /* * Left has no cgroup but right does, no * cgroups come first. */ return -1; } if (!right_cgroup) { /* * Right has no cgroup but left does, no * cgroups come first. */ return 1; } /* Two dissimilar cgroups, order by id. */ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) return -1; return 1; } } #endif if (left_group_index < right->group_index) return -1; if (left_group_index > right->group_index) return 1; return 0; } #define __node_2_pe(node) \ rb_entry((node), struct perf_event, group_node) static inline bool __group_less(struct rb_node *a, const struct rb_node *b) { struct perf_event *e = __node_2_pe(a); return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), e->group_index, __node_2_pe(b)) < 0; } struct __group_key { int cpu; struct pmu *pmu; struct cgroup *cgroup; }; static inline int __group_cmp(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); } static inline int __group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), b->group_index, b); } /* * Insert @event into @groups' tree; using * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { event->group_index = ++groups->index; rb_add(&event->group_node, &groups->tree, __group_less); } /* * Helper function to insert event into the pinned or flexible groups. */ static void add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_insert(groups, event); } /* * Delete a group from a tree. */ static void perf_event_groups_delete(struct perf_event_groups *groups, struct perf_event *event) { WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || RB_EMPTY_ROOT(&groups->tree)); rb_erase(&event->group_node, &groups->tree); init_event_group(event); } /* * Helper function to delete event from its groups. */ static void del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_delete(groups, event); } /* * Get the leftmost event in the {cpu,pmu,cgroup} subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu, struct cgroup *cgrp) { struct __group_key key = { .cpu = cpu, .pmu = pmu, .cgroup = cgrp, }; struct rb_node *node; node = rb_find_first(&key, &groups->tree, __group_cmp); if (node) return __node_2_pe(node); return NULL; } static struct perf_event * perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { struct __group_key key = { .cpu = event->cpu, .pmu = pmu, .cgroup = event_cgroup(event), }; struct rb_node *next; next = rb_next_match(&key, &event->group_node, __group_cmp); if (next) return __node_2_pe(next); return NULL; } #define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ event; event = perf_event_groups_next(event, pmu)) /* * Iterate through the whole groups tree. */ #define perf_event_groups_for_each(event, groups) \ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ typeof(*event), group_node); event; \ event = rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node)) /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; event->tstamp = perf_event_time(event); /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { event->group_caps = event->event_caps; add_event_to_groups(event, ctx); } list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); ctx->generation++; event->pmu_ctx->nr_events++; } /* * Initialize event state based on the perf_event_attr::disabled. */ static inline void perf_event__state_init(struct perf_event *event) { event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : PERF_EVENT_STATE_INACTIVE; } static void __perf_event_read_size(struct perf_event *event, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_ID) entry += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_LOST) entry += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } size += entry * nr; event->read_size = size; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; u16 size = 0; if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); if (sample_type & PERF_SAMPLE_ADDR) size += sizeof(data->addr); if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) size += sizeof(data->code_page_size); event->header_size = size; } /* * Called at perf_event creation and when events are attached/detached from a * group. */ static void perf_event__header_size(struct perf_event *event) { __perf_event_read_size(event, event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; u64 sample_type = event->attr.sample_type; u16 size = 0; if (sample_type & PERF_SAMPLE_TID) size += sizeof(data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); if (sample_type & PERF_SAMPLE_IDENTIFIER) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) size += sizeof(data->stream_id); if (sample_type & PERF_SAMPLE_CPU) size += sizeof(data->cpu_entry); event->id_header_size = size; } static bool perf_event_validate_size(struct perf_event *event) { /* * The values computed here will be over-written when we actually * attach the event. */ __perf_event_read_size(event, event->group_leader->nr_siblings + 1); __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); perf_event__id_header_size(event); /* * Sum the lot; should not exceed the 64k limit we have on records. * Conservative limit to allow for callchains and other variable fields. */ if (event->read_size + event->header_size + event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) return false; return true; } static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; lockdep_assert_held(&event->ctx->lock); /* * We can have double attach due to group movement (move_group) in * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) return; WARN_ON_ONCE(group_leader->ctx != event->ctx); group_leader->group_caps &= event->event_caps; list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; group_leader->group_generation++; perf_event__header_size(group_leader); for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } /* * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; list_del_rcu(&event->event_entry); if (event->group_leader == event) del_event_from_groups(event, ctx); /* * If event was in error state, then keep it * that way, otherwise bogus counts will be * returned on read(). The only way to get out * of error state is by explicit re-enabling * of the event */ if (event->state > PERF_EVENT_STATE_OFF) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); } ctx->generation++; event->pmu_ctx->nr_events--; } static int perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) { if (!has_aux(aux_event)) return 0; if (!event->pmu->aux_output_match) return 0; return event->pmu->aux_output_match(aux_event); } static void put_event(struct perf_event *event); static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *iter; /* * If event uses aux_event tear down the link */ if (event->aux_event) { iter = event->aux_event; event->aux_event = NULL; put_event(iter); return; } /* * If the event is an aux_event, tear down all links to * it from other events. */ for_each_sibling_event(iter, event->group_leader) { if (iter->aux_event != event) continue; iter->aux_event = NULL; put_event(event); /* * If it's ACTIVE, schedule it out and put it into ERROR * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ event_sched_out(iter, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } static bool perf_need_aux_event(struct perf_event *event) { return !!event->attr.aux_output || !!event->attr.aux_sample_size; } static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { /* * Our group leader must be an aux event if we want to be * an aux_output. This way, the aux event will precede its * aux_output events in the group, and therefore will always * schedule first. */ if (!group_leader) return 0; /* * aux_output and aux_sample_size are mutually exclusive. */ if (event->attr.aux_output && event->attr.aux_sample_size) return 0; if (event->attr.aux_output && !perf_aux_output_match(event, group_leader)) return 0; if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) return 0; /* * Link aux_outputs to their aux event; this is undone in * perf_group_detach() by perf_put_aux_event(). When the * group in torn down, the aux_output events loose their * link to the aux_event and can't schedule any more. */ event->aux_event = group_leader; return 1; } static inline struct list_head *get_event_list(struct perf_event *event) { return event->attr.pinned ? &event->pmu_ctx->pinned_active : &event->pmu_ctx->flexible_active; } /* * Events that have PERF_EV_CAP_SIBLING require being part of a group and * cannot exist on their own, schedule them out and move them into the ERROR * state. Also see _perf_event_enable(), it will not be able to recover * this ERROR state. */ static inline void perf_remove_sibling_event(struct perf_event *event) { event_sched_out(event, event->ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_GROUP)) return; event->attach_state &= ~PERF_ATTACH_GROUP; perf_put_aux_event(event); /* * If this is a sibling, remove it from its group. */ if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; event->group_leader->group_generation++; goto out; } /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { if (sibling->event_caps & PERF_EV_CAP_SIBLING) perf_remove_sibling_event(sibling); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; if (sibling->attach_state & PERF_ATTACH_CONTEXT) { add_event_to_groups(sibling, event->ctx); if (sibling->state == PERF_EVENT_STATE_ACTIVE) list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); } out: for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); perf_event__header_size(leader); } static void sync_child_event(struct perf_event *child_event); static void perf_child_detach(struct perf_event *event) { struct perf_event *parent_event = event->parent; if (!(event->attach_state & PERF_ATTACH_CHILD)) return; event->attach_state &= ~PERF_ATTACH_CHILD; if (WARN_ON_ONCE(!parent_event)) return; lockdep_assert_held(&parent_event->child_mutex); sync_child_event(event); list_del_init(&event->child_list); } static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; } static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && perf_cgroup_match(event); } static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state != PERF_EVENT_STATE_ACTIVE) return; /* * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but * we can schedule events _OUT_ individually through things like * __perf_remove_from_context(). */ list_del_init(&event->active_list); perf_pmu_disable(event->pmu); event->pmu->del(event, 0); event->oncpu = -1; if (event->pending_disable) { event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } if (event->pending_sigtrap) { bool dec = true; event->pending_sigtrap = 0; if (state != PERF_EVENT_STATE_OFF && !event->pending_work) { event->pending_work = 1; dec = false; WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); task_work_add(current, &event->pending_task, TWA_RESUME); } if (dec) local_dec(&event->ctx->nr_pending); } perf_event_set_state(event, state); if (!is_software_event(event)) cpc->active_oncpu--; if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; if (event->attr.exclusive || !cpc->active_oncpu) cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) event_sched_out(event, ctx); } #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_DEAD 0x04UL /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ static void __perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, false); } /* * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time. */ if (flags & DETACH_DEAD) event->pending_disable = 1; event_sched_out(event, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); if (flags & DETACH_DEAD) event->state = PERF_EVENT_STATE_DEAD; if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { struct perf_cpu_pmu_context *cpc; cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } } if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; } } } /* * Remove the event from a task's (or a CPU's) list of events. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->mutex); /* * Because of perf_event_exit_task(), perf_remove_from_context() ought * to work in the face of TASK_TOMBSTONE, unlike every other * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); if (!ctx->is_active) { __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_remove_from_context, (void *)flags); } /* * Cross CPU call to disable a performance event */ static void __perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { if (event->state < PERF_EVENT_STATE_INACTIVE) return; if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } perf_pmu_disable(event->pmu_ctx->pmu); if (event == event->group_leader) group_sched_out(event, ctx); else event_sched_out(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); perf_cgroup_event_disable(event, ctx); perf_pmu_enable(event->pmu_ctx->pmu); } /* * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * * When called from perf_pending_irq it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_disable, NULL); } void perf_event_disable_local(struct perf_event *event) { event_function_local(event, __perf_event_disable, NULL); } /* * Strictly speaking kernel users cannot create groups and therefore this * interface does not need the perf_event_ctx_lock() magic. */ void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_disable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; irq_work_queue(&event->pending_irq); } #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) return 0; WRITE_ONCE(event->oncpu, smp_processor_id()); /* * Order event::oncpu write to happen before the ACTIVE state is * visible. This allows perf_event_{stop,read}() to observe the correct * ->oncpu if it sees ACTIVE. */ smp_wmb(); perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { perf_log_throttle(event, 1); event->hw.interrupts = 0; } perf_pmu_disable(event->pmu); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } if (!is_software_event(event)) cpc->active_oncpu++; if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; if (event->attr.exclusive) cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); return ret; } static int group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } } if (!pmu->commit_txn(pmu)) return 0; group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: * The events up to the failed event are scheduled out normally. */ for_each_sibling_event(event, group_event) { if (event == partial_group) break; event_sched_out(event, ctx); } event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); return -EAGAIN; } /* * Work out whether we can put this event group on the CPU now. */ static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); /* * Groups consisting entirely of software events can always go on. */ if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware * events can go on. */ if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already * events on the CPU, it can't go on. */ if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able * to go on. */ return can_add_hw; } static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); perf_group_attach(event); } static void task_ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; ctx_sched_out(ctx, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); if (ctx) ctx_sched_in(ctx, EVENT_PINNED); ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); if (ctx) ctx_sched_in(ctx, EVENT_FLEXIBLE); } /* * We want to maintain the following priority of scheduling: * - CPU pinned (EVENT_CPU | EVENT_PINNED) * - task pinned (EVENT_PINNED) * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) * - task flexible (EVENT_FLEXIBLE). * * In order to avoid unscheduling and scheduling back in everything every * time an event is added, only do it for the groups of equal priority and * below. * * This can be called after a batch operation on task events, in which case * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ /* * XXX: ctx_resched() reschedule entire perf_event_context while adding new * event to the context or enabling existing event in the context. We can * probably optimize it by rescheduling only affected pmu_ctx. */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); /* * If pinned groups are involved, flexible groups also need to be * scheduled out. */ if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; event_type &= EVENT_ALL; perf_ctx_disable(&cpuctx->ctx, false); if (task_ctx) { perf_ctx_disable(task_ctx, false); task_ctx_sched_out(task_ctx, event_type); } /* * Decide which cpu ctx groups to schedule out based on the types * of events that caused rescheduling: * - EVENT_CPU: schedule out corresponding groups; * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; * - otherwise, do nothing more. */ if (cpu_event) ctx_sched_out(&cpuctx->ctx, event_type); else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx); perf_ctx_enable(&cpuctx->ctx, false); if (task_ctx) perf_ctx_enable(task_ctx, false); } void perf_pmu_resched(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } /* * Cross CPU call to install and enable a performance event * * Very similar to remote_function() + event_function() but cannot assume that * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; reprogram = (ctx->task == current); /* * If the task is running, it must be running on this CPU, * otherwise we cannot reprogram things. * * If its not running, we don't care, ctx->lock will * serialize against it becoming runnable. */ if (task_curr(ctx->task) && !reprogram) { ret = -ESRCH; goto unlock; } WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } #ifdef CONFIG_CGROUP_PERF if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it. */ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); reprogram = cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup); } #endif if (reprogram) { ctx_sched_out(ctx, EVENT_TIME); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx); /* * Attach a performance event to a context. * * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx * will be 'complete'. See perf_iterate_sb_cpu(). */ smp_store_release(&event->ctx, ctx); /* * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); return; } if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; } /* * Should not happen, we validate the ctx is still alive before calling. */ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. * * Instead we use task_curr(), which tells us if the task is running. * However, since we use task_curr() outside of rq::lock, we can race * against the actual state. This means the result can be wrong. * * If we get a false positive, we retry, this is harmless. * * If we get a false negative, things are complicated. If we are after * perf_event_context_sched_in() ctx::lock will serialize us, and the * value must be correct. If we're before, it doesn't matter since * perf_event_context_sched_in() will program the counter. * * However, this hinges on the remote context switch having observed * our task->perf_event_ctxp[] store, such that it will in fact take * ctx::lock in perf_event_context_sched_in(). * * We do this by task_function_call(), if the IPI fails to hit the task * we know any future context switch of task must see the * perf_event_ctpx[] store. */ /* * This smp_mb() orders the task->perf_event_ctxp[] store with the * task_cpu() load, such that if the IPI then does not find the task * running, a future context switch of that task must observe the * store. */ smp_mb(); again: if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); task = ctx->task; if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* * Cannot happen because we already checked above (which also * cannot happen), and we hold ctx->mutex, which serializes us * against perf_event_exit_task_context(). */ raw_spin_unlock_irq(&ctx->lock); return; } /* * If the task is not running, ctx->lock will avoid it becoming so, * thus we can safely install the event. */ if (task_curr(task)) { raw_spin_unlock_irq(&ctx->lock); goto again; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to enable a performance event */ static void __perf_event_enable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state <= PERF_EVENT_STATE_ERROR) return; if (ctx->is_active) ctx_sched_out(ctx, EVENT_TIME); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; if (!event_filter_match(event)) { ctx_sched_in(ctx, EVENT_TIME); return; } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { ctx_sched_in(ctx, EVENT_TIME); return; } task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } /* * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { out: raw_spin_unlock_irq(&ctx->lock); return; } /* * If the event is in error state, clear that first. * * That way, if we see the event in error state below, we know that it * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) { /* * Detached SIBLING events cannot leave ERROR state. */ if (event->event_caps & PERF_EV_CAP_SIBLING && event->group_leader == event) goto out; event->state = PERF_EVENT_STATE_OFF; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); } /* * See perf_event_disable(); */ void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_enable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_enable); struct stop_event_data { struct perf_event *event; unsigned int restart; }; static int __perf_event_stop(void *info) { struct stop_event_data *sd = info; struct perf_event *event = sd->event; /* if it's already INACTIVE, do nothing */ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * There is a window with interrupts enabled before we get here, * so we need to check again lest we try to stop another CPU's event. */ if (READ_ONCE(event->oncpu) != smp_processor_id()) return -EAGAIN; event->pmu->stop(event, PERF_EF_UPDATE); /* * May race with the actual stop (through perf_pmu_output_stop()), * but it is only used for events with AUX ring buffer, and such * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) event->pmu->start(event, 0); return 0; } static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, .restart = restart, }; int ret = 0; do { if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * We only want to restart ACTIVE events, so if the event goes * inactive here (event->oncpu==-1), there's nothing more to do; * fall through with ret==-ENXIO. */ ret = cpu_function_call(READ_ONCE(event->oncpu), __perf_event_stop, &sd); } while (ret == -EAGAIN); return ret; } /* * In order to contain the amount of racy and tricky in the address filter * configuration management, it is a two part process: * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. * * If (p1) happens while the event is active, we restart it to force (p2). * * (1) perf_addr_filters_apply(): adjusting filters' offsets based on * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. */ void perf_event_addr_filters_sync(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); if (!has_addr_filter(event)) return; raw_spin_lock(&ifh->lock); if (event->addr_filters_gen != event->hw.addr_filters_gen) { event->pmu->addr_filters_sync(event); event->hw.addr_filters_gen = event->addr_filters_gen; } raw_spin_unlock(&ifh->lock); } EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events */ if (event->attr.inherit || !is_sampling_event(event)) return -EINVAL; atomic_add(refresh, &event->event_limit); _perf_event_enable(event); return 0; } /* * See perf_event_disable() */ int perf_event_refresh(struct perf_event *event, int refresh) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_refresh(event, refresh); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_refresh); static int perf_event_modify_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { int err; _perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, true); if (!bp->attr.disabled) _perf_event_enable(bp); return err; } /* * Copy event-type-independent attributes that may be modified. */ static void perf_event_modify_copy_attr(struct perf_event_attr *to, const struct perf_event_attr *from) { to->sig_data = from->sig_data; } static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { int (*func)(struct perf_event *, struct perf_event_attr *); struct perf_event *child; int err; if (event->attr.type != attr->type) return -EINVAL; switch (event->attr.type) { case PERF_TYPE_BREAKPOINT: func = perf_event_modify_breakpoint; break; default: /* Place holder for future additions. */ return -EOPNOTSUPP; } WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); /* * Event-type-independent attributes must be copied before event-type * modification, which will validate that final attributes match the * source attributes after all relevant attributes have been copied. */ perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; } out: mutex_unlock(&event->child_mutex); return err; } static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, enum event_type_t event_type) { struct perf_event_context *ctx = pmu_ctx->ctx; struct perf_event *event, *tmp; struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !ctx->is_active) { struct perf_cpu_pmu_context *cpc; cpc = this_cpu_ptr(pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } if (!event_type) return; perf_pmu_disable(pmu); if (event_type & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list) group_sched_out(event, ctx); } if (event_type & EVENT_FLEXIBLE) { list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list) group_sched_out(event, ctx); /* * Since we cleared EVENT_FLEXIBLE, also clear * rotate_necessary, is will be reset by * ctx_flexible_sched_in() when needed. */ pmu_ctx->rotate_necessary = 0; } perf_pmu_enable(pmu); } static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) { /* * See __perf_remove_from_context(). */ WARN_ON_ONCE(ctx->is_active); if (ctx->task) WARN_ON_ONCE(cpuctx->task_ctx); return; } /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last * context we sched out. For example: * * ctx_sched_out(.event_type = EVENT_FLEXIBLE) * ctx_sched_out(.event_type = EVENT_PINNED) * * would only update time for the pinned events. */ if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); } ctx->is_active &= ~event_type; if (!(ctx->is_active & EVENT_ALL)) ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!ctx->is_active) cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (cgroup && !pmu_ctx->nr_cgroups) continue; __pmu_ctx_sched_out(pmu_ctx, is_active); } } /* * Test whether two contexts are equivalent, i.e. whether they have both been * cloned from the same version of the same context. * * Equivalence is measured using a generation number in the context that is * incremented on each modification to it; see unclone_ctx(), list_add_event() * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { lockdep_assert_held(&ctx1->lock); lockdep_assert_held(&ctx2->lock); /* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0; /* If ctx1 is the parent of ctx2 */ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) return 1; /* If ctx2 is the parent of ctx1 */ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) return 1; /* * If ctx1 and ctx2 have the same parent; we flatten the parent * hierarchy, see perf_event_init_context(). */ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen) return 1; /* Unmatched */ return 0; } static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { u64 value; if (!event->attr.inherit_stat) return; /* * Update the event value, we cannot use perf_event_read() * because we're in the middle of a context switch and have IRQs * disabled, which upsets smp_call_function_single(), however * we know the event must be on the current CPU, therefore we * don't need to use it. */ if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ value = local64_read(&next_event->count); value = local64_xchg(&event->count, value); local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); /* * Since we swizzled the values, update the user visible data too. */ perf_event_update_userpage(event); perf_event_update_userpage(next_event); } static void perf_event_sync_stat(struct perf_event_context *ctx, struct perf_event_context *next_ctx) { struct perf_event *event, *next_event; if (!ctx->nr_stat) return; update_context_time(ctx); event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); next_event = list_first_entry(&next_ctx->event_list, struct perf_event, event_entry); while (&event->event_entry != &ctx->event_list && &next_event->event_entry != &next_ctx->event_list) { __perf_event_sync_stat(event, next_event); event = list_next_entry(event, event_entry); next_event = list_next_entry(next_event, event_entry); } } #define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ pos2 = list_first_entry(head2, typeof(*pos2), member); \ !list_entry_is_head(pos1, head1, member) && \ !list_entry_is_head(pos2, head2, member); \ pos1 = list_next_entry(pos1, member), \ pos2 = list_next_entry(pos2, member)) static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, struct perf_event_context *next_ctx) { struct perf_event_pmu_context *prev_epc, *next_epc; if (!prev_ctx->nr_task_data) return; double_list_for_each_entry(prev_epc, next_epc, &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, pmu_ctx_entry) { if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) continue; /* * PMU specific parts of task perf context can require * additional synchronization. As an example of such * synchronization see implementation details of Intel * LBR call stack data profiling; */ if (prev_epc->pmu->swap_task_ctx) prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); else swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); } } static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) { struct perf_event_pmu_context *pmu_ctx; struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); } } static void perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; int do_switch = 1; if (likely(!ctx)) return; rcu_read_lock(); next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; parent = rcu_dereference(ctx->parent_ctx); next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. */ if (!parent && !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both * contexts and check that they are clones under the * lock (including re-checking that neither has been * uncloned in the meantime). It doesn't matter which * order we take the locks because no other cpu could * be trying to lock both of these tasks. */ raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { perf_ctx_disable(ctx, false); /* PMIs are disabled; ctx->nr_pending is stable. */ if (local_read(&ctx->nr_pending) || local_read(&next_ctx->nr_pending)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. */ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); goto inside_switch; } WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); perf_ctx_sched_task_cb(ctx, false); perf_event_swap_task_ctx_data(ctx, next_ctx); perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of * ctx->task and ctx->task_ctx_data are immaterial * since those values are always verified under * ctx->lock which we're now holding. */ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; perf_event_sync_stat(ctx, next_ctx); } raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } unlock: rcu_read_unlock(); if (do_switch) { raw_spin_lock(&ctx->lock); perf_ctx_disable(ctx, false); inside_switch: perf_ctx_sched_task_cb(ctx, false); task_ctx_sched_out(ctx, EVENT_ALL); perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); this_cpu_dec(perf_sched_cb_usages); barrier(); if (!--cpc->sched_cb_usage) list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); barrier(); this_cpu_inc(perf_sched_cb_usages); } /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. * * This callback is relevant even to per-cpu events; for example multi event * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; pmu = cpc->epc.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); pmu->sched_task(cpc->task_epc, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cpu_pmu_context *cpc; /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ if (prev == next || cpuctx->task_ctx) return; list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) __perf_pmu_sched_task(cpc, sched_in); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. * * We stop each event and update the event value in event->count. * * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * not restart the event. */ void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ perf_cgroup_switch(next); } static bool perf_less_group_idx(const void *l, const void *r) { const struct perf_event *le = *(const struct perf_event **)l; const struct perf_event *re = *(const struct perf_event **)r; return le->group_index < re->group_index; } static void swap_ptr(void *l, void *r) { void **lp = l, **rp = r; swap(*lp, *rp); } static const struct min_heap_callbacks perf_min_heap = { .elem_size = sizeof(struct perf_event *), .less = perf_less_group_idx, .swp = swap_ptr, }; static void __heap_add(struct min_heap *heap, struct perf_event *event) { struct perf_event **itrs = heap->data; if (event) { itrs[heap->nr] = event; heap->nr++; } } static void __link_epc(struct perf_event_pmu_context *pmu_ctx) { struct perf_cpu_pmu_context *cpc; if (!pmu_ctx->ctx->task) return; cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } static noinline int visit_groups_merge(struct perf_event_context *ctx, struct perf_event_groups *groups, int cpu, struct pmu *pmu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif struct perf_cpu_context *cpuctx = NULL; /* Space for per CPU and/or any CPU event iterators. */ struct perf_event *itrs[2]; struct min_heap event_heap; struct perf_event **evt; int ret; if (pmu->filter && pmu->filter(pmu, cpu)) return 0; if (!ctx->task) { cpuctx = this_cpu_ptr(&perf_cpu_context); event_heap = (struct min_heap){ .data = cpuctx->heap, .nr = 0, .size = cpuctx->heap_size, }; lockdep_assert_held(&cpuctx->ctx.lock); #ifdef CONFIG_CGROUP_PERF if (cpuctx->cgrp) css = &cpuctx->cgrp->css; #endif } else { event_heap = (struct min_heap){ .data = itrs, .nr = 0, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. */ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); } evt = event_heap.data; __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); #endif if (event_heap.nr) { __link_epc((*evt)->pmu_ctx); perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); } min_heapify_all(&event_heap, &perf_min_heap); while (event_heap.nr) { ret = func(*evt, data); if (ret) return ret; *evt = perf_event_groups_next(*evt, pmu); if (*evt) min_heapify(&event_heap, 0, &perf_min_heap); else min_heap_pop(&event_heap, &perf_min_heap); } return 0; } /* * Because the userpage is strictly per-event (there is no concept of context, * so there cannot be a context indirection), every userpage must be updated * when context time starts :-( * * IOW, we must not miss EVENT_TIME edges. */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); perf_event_update_userpage(event); return true; } static inline void group_update_userpage(struct perf_event *group_event) { struct perf_event *event; if (!event_update_userpage(group_event)) return; for_each_sibling_event(event, group_event) event_update_userpage(event); } static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; if (!event_filter_match(event)) return 0; if (group_can_go_on(event, *can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } else { struct perf_cpu_pmu_context *cpc; event->pmu_ctx->rotate_necessary = 1; cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } } return 0; } static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, struct pmu *pmu) { int can_add_hw = 1; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, merge_sched_in, &can_add_hw); } static void ctx_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (cgroup && !pmu_ctx->nr_cgroups) continue; pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); } } static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); } static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; if (!(is_active & EVENT_TIME)) { /* start ctx time */ __update_context_time(ctx, false); perf_cgroup_set_timestamp(cpuctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); } ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); } is_active ^= ctx->is_active; /* changed bits */ /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup); /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup); } static void perf_event_context_sched_in(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto rcu_unlock; if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); perf_ctx_disable(ctx, false); perf_ctx_sched_task_cb(ctx, true); perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } perf_ctx_lock(cpuctx, ctx); /* * We must check ctx->nr_events while holding ctx->lock, such * that we serialize against perf_install_in_context(). */ if (!ctx->nr_events) goto unlock; perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. * * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { perf_ctx_disable(&cpuctx->ctx, false); ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); } perf_event_sched_in(cpuctx, ctx); perf_ctx_sched_task_cb(cpuctx->task_ctx, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) perf_ctx_enable(&cpuctx->ctx, false); perf_ctx_enable(ctx, false); unlock: perf_ctx_unlock(cpuctx, ctx); rcu_unlock: rcu_read_unlock(); } /* * Called from scheduler to add the events of the current task * with interrupts disabled. * * We restore the event value and then enable it. * * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * keep the event running. */ void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; u64 sec = NSEC_PER_SEC; u64 divisor, dividend; int count_fls, nsec_fls, frequency_fls, sec_fls; count_fls = fls64(count); nsec_fls = fls64(nsec); frequency_fls = fls64(frequency); sec_fls = 30; /* * We got @count in @nsec, with a target of sample_freq HZ * the target period becomes: * * @count * 10^9 * period = ------------------- * @nsec * sample_freq * */ /* * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ #define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ a##_fls--; \ } else { \ b >>= 1; \ b##_fls--; \ } \ } while (0) /* * Reduce accuracy until either term fits in a u64, then proceed with * the other, so that finally we can do a u64/u64 division. */ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); REDUCE_FLS(sec, count); } if (count_fls + sec_fls > 64) { divisor = nsec * frequency; while (count_fls + sec_fls > 64) { REDUCE_FLS(count, sec); divisor >>= 1; } dividend = count * sec; } else { dividend = count * sec; while (nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); dividend >>= 1; } divisor = nsec * frequency; } if (!divisor) return dividend; return div64_u64(dividend, divisor); } static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ sample_period = hwc->sample_period + delta; if (!sample_period) sample_period = 1; hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { if (disable) event->pmu->stop(event, PERF_EF_UPDATE); local64_set(&hwc->period_left, 0); if (disable) event->pmu->start(event, PERF_EF_RELOAD); } } /* * combine freq adjustment with unthrottling to avoid two passes over the * events. At the same time, make sure, having freq events does not change * the rate of unthrottling as that would introduce bias. */ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) { struct perf_event *event; struct hw_perf_event *hwc; u64 now, period = TICK_NSEC; s64 delta; /* * only need to iterate over all events iff: * - context have events in frequency mode (needs freq adjust) * - there are events to unthrottle on this cpu */ if (!(ctx->nr_freq || unthrottle)) return; raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; perf_pmu_disable(event->pmu); hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) { hwc->interrupts = 0; perf_log_throttle(event, 1); event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) goto next; /* * stop the event and update event->count */ event->pmu->stop(event, PERF_EF_UPDATE); now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; /* * restart the event * reload only if value has changed * we have stopped the event so tell that * to perf_adjust_period() to avoid stopping it * twice. */ if (delta > 0) perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); next: perf_pmu_enable(event->pmu); } raw_spin_unlock(&ctx->lock); } /* * Move @event to the tail of the @ctx's elegible events. */ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. */ if (ctx->rotate_disable) return; perf_event_groups_delete(&ctx->flexible_groups, event); perf_event_groups_insert(&ctx->flexible_groups, event); } /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { struct perf_event *event; struct rb_node *node; struct rb_root *tree; struct __group_key key = { .pmu = pmu_ctx->pmu, }; /* pick the first active flexible event */ event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); if (event) goto out; /* if no active flexible event, pick the first event */ tree = &pmu_ctx->ctx->flexible_groups.tree; if (!pmu_ctx->ctx->task) { key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); goto out; } key.cpu = -1; node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) { event = __node_2_pe(node); goto out; } key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ pmu_ctx->rotate_necessary = 0; return event; } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; struct perf_event *cpu_event = NULL, *task_event = NULL; int cpu_rotate, task_rotate; struct pmu *pmu; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ cpu_epc = &cpc->epc; pmu = cpu_epc->pmu; task_epc = cpc->task_epc; cpu_rotate = cpu_epc->rotate_necessary; task_rotate = task_epc ? task_epc->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); if (task_rotate) task_event = ctx_event_to_rotate(task_epc); if (cpu_rotate) cpu_event = ctx_event_to_rotate(cpu_epc); /* * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. */ if (task_event || (task_epc && cpu_event)) { update_context_time(task_epc->ctx); __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); } if (cpu_event) { update_context_time(&cpuctx->ctx); __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); __pmu_ctx_sched_in(&cpuctx->ctx, pmu); } if (task_event) rotate_ctx(task_epc->ctx, task_event); if (task_event || (task_epc && cpu_event)) __pmu_ctx_sched_in(task_epc->ctx, pmu); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); return true; } void perf_event_task_tick(void) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; int throttled; lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); rcu_read_lock(); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_adjust_freq_unthr_context(ctx, !!throttled); rcu_read_unlock(); } static int event_enable_on_exec(struct perf_event *event, struct perf_event_context *ctx) { if (!event->attr.enable_on_exec) return 0; event->attr.enable_on_exec = 0; if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } /* * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ static void perf_event_enable_on_exec(struct perf_event_context *ctx) { struct perf_event_context *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; local_irq_save(flags); if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) goto out; if (!ctx->nr_events) goto out; cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); ctx_sched_out(ctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); } /* * Unclone and reschedule this context if we enabled any event. */ if (enabled) { clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { ctx_sched_in(ctx, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); out: local_irq_restore(flags); if (clone_ctx) put_ctx(clone_ctx); } static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx); /* * Removes all events from the current task that have been marked * remove-on-exec, and feeds their values back to parent events. */ static void perf_event_remove_on_exec(struct perf_event_context *ctx) { struct perf_event_context *clone_ctx = NULL; struct perf_event *event, *next; unsigned long flags; bool modified = false; mutex_lock(&ctx->mutex); if (WARN_ON_ONCE(ctx->task != current)) goto unlock; list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { if (!event->attr.remove_on_exec) continue; if (!is_kernel_event(event)) perf_remove_from_owner(event); modified = true; perf_event_exit_event(event, ctx); } raw_spin_lock_irqsave(&ctx->lock, flags); if (modified) clone_ctx = unclone_ctx(ctx); raw_spin_unlock_irqrestore(&ctx->lock, flags); unlock: mutex_unlock(&ctx->mutex); if (clone_ctx) put_ctx(clone_ctx); } struct perf_read_data { struct perf_event *event; bool group; int ret; }; static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { u16 local_pkg, event_pkg; if ((unsigned)event_cpu >= nr_cpu_ids) return event_cpu; if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { int local_cpu = smp_processor_id(); event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); if (event_pkg == local_pkg) return local_cpu; } return event_cpu; } /* * Cross CPU call to read the hardware event */ static void __perf_event_read(void *info) { struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu = event->pmu; /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. In that case * event->count would have been updated to a recent sample * when the event was scheduled out. */ if (ctx->task && cpuctx->task_ctx != ctx) return; raw_spin_lock(&ctx->lock); if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } perf_event_update_time(event); if (data->group) perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; if (!data->group) { pmu->read(event); data->ret = 0; goto unlock; } pmu->start_txn(pmu, PERF_PMU_TXN_READ); pmu->read(event); for_each_sibling_event(sub, event) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since * sibling could be on different (eg: software) PMU. */ sub->pmu->read(sub); } } data->ret = pmu->commit_txn(pmu); unlock: raw_spin_unlock(&ctx->lock); } static inline u64 perf_event_count(struct perf_event *event) { return local64_read(&event->count) + atomic64_read(&event->child_count); } static void calc_timer_values(struct perf_event *event, u64 *now, u64 *enabled, u64 *running) { u64 ctx_time; *now = perf_clock(); ctx_time = perf_event_time_now(event, *now); __perf_update_times(event, ctx_time, enabled, running); } /* * NMI-safe method to read a local event, that is an event that * is: * - either for the current task, or for this CPU * - does not have inherit set, for inherited task events * will not be local and we cannot read them atomically * - must not have a pmu::count method */ int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { unsigned long flags; int event_oncpu; int event_cpu; int ret = 0; /* * Disabling interrupts avoids all counter scheduling (context * switches, timer based rotation and IPIs). */ local_irq_save(flags); /* * It must not be an event with inherit set, we cannot read * all child counters from atomic context. */ if (event->attr.inherit) { ret = -EOPNOTSUPP; goto out; } /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { ret = -EINVAL; goto out; } /* * Get the event CPU numbers, and adjust them to local if the event is * a per-package event that can be read locally */ event_oncpu = __perf_event_read_cpu(event, event->oncpu); event_cpu = __perf_event_read_cpu(event, event->cpu); /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && event_cpu != smp_processor_id()) { ret = -EINVAL; goto out; } /* If this is a pinned event it must be running on this CPU */ if (event->attr.pinned && event_oncpu != smp_processor_id()) { ret = -EBUSY; goto out; } /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ if (event_oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); if (enabled || running) { u64 __enabled, __running, __now; calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) *running = __running; } out: local_irq_restore(flags); return ret; } static int perf_event_read(struct perf_event *event, bool group) { enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ again: if (state == PERF_EVENT_STATE_ACTIVE) { struct perf_read_data data; /* * Orders the ->state and ->oncpu loads such that if we see * ACTIVE we must also see the right ->oncpu. * * Matches the smp_wmb() from event_sched_in(). */ smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; data = (struct perf_read_data){ .event = event, .group = group, .ret = 0, }; preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); /* * Purposely ignore the smp_call_function_single() return * value. * * If event_cpu isn't a valid CPU it means the event got * scheduled out and that will have updated the event count. * * Therefore, either way, we'll have an up-to-date event count * after this. */ (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); state = event->state; if (state != PERF_EVENT_STATE_INACTIVE) { raw_spin_unlock_irqrestore(&ctx->lock, flags); goto again; } /* * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } perf_event_update_time(event); if (group) perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ret; } /* * Initialize the perf_event context in a task_struct: */ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->pmu_ctx_list); perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); refcount_set(&ctx->refcount, 1); } static void __perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) { epc->pmu = pmu; INIT_LIST_HEAD(&epc->pmu_ctx_entry); INIT_LIST_HEAD(&epc->pinned_active); INIT_LIST_HEAD(&epc->flexible_active); atomic_set(&epc->refcount, 1); } static struct perf_event_context * alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); if (!ctx) return NULL; __perf_event_init_context(ctx); if (task) ctx->task = get_task_struct(task); return ctx; } static struct task_struct * find_lively_task_by_vpid(pid_t vpid) { struct task_struct *task; rcu_read_lock(); if (!vpid) task = current; else task = find_task_by_vpid(vpid); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) return ERR_PTR(-ESRCH); return task; } /* * Returns a matching context with refcount and pincount. */ static struct perf_event_context * find_get_context(struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; unsigned long flags; int err; if (!task) { /* Must be root to operate on a CPU event: */ err = perf_allow_cpu(&event->attr); if (err) return ERR_PTR(err); cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); ctx = &cpuctx->ctx; get_ctx(ctx); raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); return ctx; } err = -EINVAL; retry: ctx = perf_lock_task_context(task, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { ctx = alloc_perf_context(task); err = -ENOMEM; if (!ctx) goto errout; err = 0; mutex_lock(&task->perf_event_mutex); /* * If it has already passed perf_event_exit_task(). * we must see PF_EXITING, it takes this mutex too. */ if (task->flags & PF_EXITING) err = -ESRCH; else if (task->perf_event_ctxp) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp, ctx); } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { put_ctx(ctx); if (err == -EAGAIN) goto retry; goto errout; } } return ctx; errout: return ERR_PTR(err); } static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { struct perf_event_pmu_context *new = NULL, *epc; void *task_ctx_data = NULL; if (!ctx->task) { struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { atomic_set(&epc->refcount, 1); epc->embedded = 1; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; } else { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); } raw_spin_unlock_irq(&ctx->lock); return epc; } new = kzalloc(sizeof(*epc), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); if (event->attach_state & PERF_ATTACH_TASK_DATA) { task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { kfree(new); return ERR_PTR(-ENOMEM); } } __perf_init_event_pmu_context(new, pmu); /* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because perf_event_init_task() doesn't actually hold the * child_ctx->mutex. */ raw_spin_lock_irq(&ctx->lock); list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (epc->pmu == pmu) { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); goto found_epc; } } epc = new; new = NULL; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; found_epc: if (task_ctx_data && !epc->task_ctx_data) { epc->task_ctx_data = task_ctx_data; task_ctx_data = NULL; ctx->nr_task_data++; } raw_spin_unlock_irq(&ctx->lock); free_task_ctx_data(pmu, task_ctx_data); kfree(new); return epc; } static void get_pmu_ctx(struct perf_event_pmu_context *epc) { WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); } static void free_epc_rcu(struct rcu_head *head) { struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); kfree(epc->task_ctx_data); kfree(epc); } static void put_pmu_ctx(struct perf_event_pmu_context *epc) { struct perf_event_context *ctx = epc->ctx; unsigned long flags; /* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because of the call-site in _free_event()/put_event() * which isn't always called under ctx->mutex. */ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return; WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); list_del_init(&epc->pmu_ctx_entry); epc->ctx = NULL; WARN_ON_ONCE(!list_empty(&epc->pinned_active)); WARN_ON_ONCE(!list_empty(&epc->flexible_active)); raw_spin_unlock_irqrestore(&ctx->lock, flags); if (epc->embedded) return; call_rcu(&epc->rcu_head, free_epc_rcu); } static void perf_event_free_filter(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { struct perf_event *event = container_of(head, typeof(*event), rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); kmem_cache_free(perf_event_cache, event); } static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb); static void detach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_del_rcu(&event->sb_list); raw_spin_unlock(&pel->lock); } static bool is_sb_event(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; if (event->parent) return false; if (event->attach_state & PERF_ATTACH_TASK) return false; if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; } static void unaccount_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) detach_sb_event(event); } #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif static void unaccount_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL spin_lock(&nr_freq_lock); if (atomic_dec_and_test(&nr_freq_events)) tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void unaccount_freq_event(void) { if (tick_nohz_full_enabled()) unaccount_freq_event_nohz(); else atomic_dec(&nr_freq_events); } static void unaccount_event(struct perf_event *event) { bool dec = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.build_id) atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) atomic_dec(&nr_namespaces_events); if (event->attr.cgroup) atomic_dec(&nr_cgroup_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) unaccount_freq_event(); if (event->attr.context_switch) { dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) dec = true; if (has_branch_stack(event)) dec = true; if (event->attr.ksymbol) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); if (event->attr.text_poke) atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) schedule_delayed_work(&perf_sched_work, HZ); } unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) { mutex_lock(&perf_sched_mutex); if (atomic_dec_and_test(&perf_sched_count)) static_branch_disable(&perf_sched_events); mutex_unlock(&perf_sched_mutex); } /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled * at a time, so we disallow creating events that might conflict, namely: * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return 0; /* * Prevent co-existence of per-task and cpu-wide events on the * same exclusive pmu. * * Negative pmu::exclusive_cnt means there are cpu-wide * events on this "exclusive" pmu, positive means there are * per-task events. * * Since this is called in perf_event_alloc() path, event::ctx * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK * to mean "per-task event", because unlike other attach states it * never gets cleared. */ if (event->attach_state & PERF_ATTACH_TASK) { if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) return -EBUSY; } else { if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) return -EBUSY; } return 0; } static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return; /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) return true; return false; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; lockdep_assert_held(&ctx->mutex); if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { if (exclusive_event_match(iter_event, event)) return false; } return true; } static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head); static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); unaccount_event(event); security_perf_event_free(event); if (event->rb) { /* * Can happen when we close an event with re-directed output. * * Since we have a 0 refcount, perf_mmap_close() will skip * over us; possibly making our ring_buffer_put() the last. */ mutex_lock(&event->mmap_mutex); ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) perf_detach_cgroup(event); if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); } perf_event_free_bpf_prog(event); perf_addr_filters_splice(event, NULL); kfree(event->addr_filter_ranges); if (event->destroy) event->destroy(event); /* * Must be after ->destroy(), due to uprobe_perf_close() using * hw.target. */ if (event->hw.target) put_task_struct(event->hw.target); if (event->pmu_ctx) put_pmu_ctx(event->pmu_ctx); /* * perf_event_free_task() relies on put_ctx() being 'last', in particular * all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); exclusive_event_destroy(event); module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } /* * Used to free events which have a known refcount of 1, such as in error paths * where the event isn't exposed yet and inherited events. */ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, "unexpected event refcount: %ld; ptr=%p\n", atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } _free_event(event); } /* * Remove user event from the owner task. */ static void perf_remove_from_owner(struct perf_event *event) { struct task_struct *owner; rcu_read_lock(); /* * Matches the smp_store_release() in perf_event_exit_task(). If we * observe !owner it means the list deletion is complete and we can * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last * task reference we can safely take a new reference * while holding the rcu_read_lock(). */ get_task_struct(owner); } rcu_read_unlock(); if (owner) { /* * If we're here through perf_event_exit_task() we're already * holding ctx->mutex which would be an inversion wrt. the * normal lock order. * * However we can safely take this lock because its the child * ctx->mutex. */ mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex * ensured they're done, and we can proceed with freeing the * event. */ if (event->owner) { list_del_init(&event->owner_entry); smp_store_release(&event->owner, NULL); } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } } static void put_event(struct perf_event *event) { if (!atomic_long_dec_and_test(&event->refcount)) return; _free_event(event); } /* * Kill an event dead; while event:refcount will preserve the event * object, it will not preserve its functionality. Once the last 'user' * gives up the object, we'll destroy the thing. */ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; LIST_HEAD(free_list); /* * If we got here through err_alloc: free_event(event); we will not * have attached to a context yet. */ if (!ctx) { WARN_ON_ONCE(event->attach_state & (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); goto no_ctx; } if (!is_kernel_event(event)) perf_remove_from_owner(event); ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. * * Anybody acquiring event->child_mutex after the below loop _must_ * also see this, most importantly inherit_event() which will avoid * placing more children on the list. * * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. * * Since the event cannot get freed while we hold the * child_mutex, the context must also exist and have a !0 * reference count. */ get_ctx(ctx); /* * Now that we have a ctx ref, we can drop child_mutex, and * acquire ctx::mutex without fear of it going away. Then we * can re-acquire child_mutex. */ mutex_unlock(&event->child_mutex); mutex_lock(&ctx->mutex); mutex_lock(&event->child_mutex); /* * Now that we hold ctx::mutex and child_mutex, revalidate our * state, if child is still the first entry, it didn't get freed * and we can continue doing so. */ tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); list_move(&child->child_list, &free_list); /* * This matches the refcount bump in inherit_event(); * this can't be the last reference. */ put_event(event); } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } mutex_unlock(&event->child_mutex); list_for_each_entry_safe(child, tmp, &free_list, child_list) { void *var = &child->ctx->refcount; list_del(&child->child_list); free_event(child); /* * Wake any perf_event_free_task() waiting for this event to be * freed. */ smp_mb(); /* pairs with wait_var_event() */ wake_up_var(var); } no_ctx: put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ static int perf_release(struct inode *inode, struct file *file) { perf_event_release_kernel(file->private_data); return 0; } static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; *enabled = 0; *running = 0; mutex_lock(&event->child_mutex); (void)perf_event_read(event, false); total += perf_event_count(event); *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); *running += event->total_time_running + atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; } mutex_unlock(&event->child_mutex); return total; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); count = __perf_event_read_value(event, enabled, running); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event_context *ctx = leader->ctx; struct perf_event *sub, *parent; unsigned long flags; int n = 1; /* skip @nr */ int ret; ret = perf_event_read(leader, true); if (ret) return ret; raw_spin_lock_irqsave(&ctx->lock, flags); /* * Verify the grouping between the parent and child (inherited) * events is still in tact. * * Specifically: * - leader->ctx->lock pins leader->sibling_list * - parent->child_mutex pins parent->child_list * - parent->ctx->mutex pins parent->sibling_list * * Because parent->ctx != leader->ctx (and child_list nests inside * ctx->mutex), group destruction is not atomic between children, also * see perf_event_release_kernel(). Additionally, parent can grow the * group. * * Therefore it is possible to have parent and child groups in a * different configuration and summing over such a beast makes no sense * what so ever. * * Reject this. */ parent = leader->parent; if (parent && (parent->group_generation != leader->group_generation || parent->nr_siblings != leader->nr_siblings)) { ret = -ECHILD; goto unlock; } /* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one * set. */ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] += leader->total_time_enabled + atomic64_read(&leader->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] += leader->total_time_running + atomic64_read(&leader->child_total_time_running); } /* * Write {count,id} tuples for every sibling. */ values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); } unlock: raw_spin_unlock_irqrestore(&ctx->lock, flags); return ret; } static int perf_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *child; struct perf_event_context *ctx = leader->ctx; int ret; u64 *values; lockdep_assert_held(&ctx->mutex); values = kzalloc(event->read_size, GFP_KERNEL); if (!values) return -ENOMEM; values[0] = 1 + leader->nr_siblings; mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); if (ret) goto unlock; list_for_each_entry(child, &leader->child_list, child_list) { ret = __perf_read_group_add(child, read_format, values); if (ret) goto unlock; } mutex_unlock(&leader->child_mutex); ret = event->read_size; if (copy_to_user(buf, values, event->read_size)) ret = -EFAULT; goto out; unlock: mutex_unlock(&leader->child_mutex); out: kfree(values); return ret; } static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; return n * sizeof(u64); } static bool is_event_hup(struct perf_event *event) { bool no_children; if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); no_children = list_empty(&event->child_list); mutex_unlock(&event->child_mutex); return no_children; } /* * Read the performance event - simple non blocking version for now */ static ssize_t __perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; /* * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ if (event->state == PERF_EVENT_STATE_ERROR) return 0; if (count < event->read_size) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) ret = perf_read_group(event, read_format, buf); else ret = perf_read_one(event, read_format, buf); return ret; } static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; int ret; ret = security_perf_event_read(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; } static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct perf_buffer *rb; __poll_t events = EPOLLHUP; poll_wait(file, &event->waitq, wait); if (is_event_hup(event)) return events; /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); rb = event->rb; if (rb) events = atomic_xchg(&rb->poll, 0); mutex_unlock(&event->mmap_mutex); return events; } static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); } /* Assume it's not an event with inherit set. */ u64 perf_event_pause(struct perf_event *event, bool reset) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(event->attr.inherit); _perf_event_disable(event); count = local64_read(&event->count); if (reset) local64_set(&event->count, 0); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_pause); /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event *child; WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); func(event); list_for_each_entry(child, &event->child_list, child_list) func(child); mutex_unlock(&event->child_mutex); } static void perf_event_for_each(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event_context *ctx = event->ctx; struct perf_event *sibling; lockdep_assert_held(&ctx->mutex); event = event->group_leader; perf_event_for_each_child(event, func); for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } static void __perf_event_period(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { u64 value = *((u64 *)info); bool active; if (event->attr.freq) { event->attr.sample_freq = value; } else { event->attr.sample_period = value; event->hw.sample_period = value; } active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(event->pmu); /* * We could be throttled; unthrottle now to avoid the tick * trying to unthrottle while we already re-started the event. */ if (event->hw.interrupts == MAX_INTERRUPTS) { event->hw.interrupts = 0; perf_log_throttle(event, 1); } event->pmu->stop(event, PERF_EF_UPDATE); } local64_set(&event->hw.period_left, 0); if (active) { event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(event->pmu); } } static int perf_event_check_period(struct perf_event *event, u64 value) { return event->pmu->check_period(event, value); } static int _perf_event_period(struct perf_event *event, u64 value) { if (!is_sampling_event(event)) return -EINVAL; if (!value) return -EINVAL; if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; if (perf_event_check_period(event, value)) return -EINVAL; if (!event->attr.freq && (value & (1ULL << 63))) return -EINVAL; event_function_call(event, __perf_event_period, &value); return 0; } int perf_event_period(struct perf_event *event, u64 value) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_period(event, value); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_period); static const struct file_operations perf_fops; static inline int perf_fget_light(int fd, struct fd *p) { struct fd f = fdget(fd); if (!f.file) return -EBADF; if (f.file->f_op != &perf_fops) { fdput(f); return -EBADF; } *p = f; return 0; } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: { u64 value; if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) return -EFAULT; return _perf_event_period(event, value); } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); if (copy_to_user((void __user *)arg, &id, sizeof(id))) return -EFAULT; return 0; } case PERF_EVENT_IOC_SET_OUTPUT: { int ret; if (arg != -1) { struct perf_event *output_event; struct fd output; ret = perf_fget_light(arg, &output); if (ret) return ret; output_event = output.file->private_data; ret = perf_event_set_output(event, output_event); fdput(output); } else { ret = perf_event_set_output(event, NULL); } return ret; } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: { struct bpf_prog *prog; int err; prog = bpf_prog_get(arg); if (IS_ERR(prog)) return PTR_ERR(prog); err = perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; } return 0; } case PERF_EVENT_IOC_PAUSE_OUTPUT: { struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb || !rb->nr_pages) { rcu_read_unlock(); return -EINVAL; } rb_toggle_paused(rb, !!arg); rcu_read_unlock(); return 0; } case PERF_EVENT_IOC_QUERY_BPF: return perf_event_query_prog_array(event, (void __user *)arg); case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { struct perf_event_attr new_attr; int err = perf_copy_attr((struct perf_event_attr __user *)arg, &new_attr); if (err) return err; return perf_event_modify_attr(event, &new_attr); } default: return -ENOTTY; } if (flags & PERF_IOC_FLAG_GROUP) perf_event_for_each(event, func); else perf_event_for_each_child(event, func); return 0; } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; long ret; /* Treat ioctl like writes as it is likely a mutating operation. */ ret = security_perf_event_write(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); return ret; } #ifdef CONFIG_COMPAT static long perf_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; cmd |= sizeof(void *) << IOCSIZE_SHIFT; } break; } return perf_ioctl(file, cmd, arg); } #else # define perf_compat_ioctl NULL #endif int perf_event_task_enable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); list_for_each_entry(event, &current->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_enable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(&current->perf_event_mutex); return 0; } int perf_event_task_disable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); list_for_each_entry(event, &current->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_disable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(&current->perf_event_mutex); return 0; } static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) return 0; if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; return event->pmu->event_idx(event); } static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; userpg = rb->user_page; /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); userpg->data_offset = PAGE_SIZE; userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); } void __weak arch_perf_update_userpage( struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { } /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch * code calls this from NMI context. */ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; u64 enabled, running, now; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. * * we cannot simply called update_context_time() * because of locking issue as we can be called in * NMI context */ calc_timer_values(event, &now, &enabled, &running); userpg = rb->user_page; /* * Disable preemption to guarantee consistent time stamps are stored to * the user page. */ preempt_disable(); ++userpg->lock; barrier(); userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = enabled + atomic64_read(&event->child_total_time_enabled); userpg->time_running = running + atomic64_read(&event->child_total_time_running); arch_perf_update_userpage(event, userpg, now); barrier(); ++userpg->lock; preempt_enable(); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(perf_event_update_userpage); static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) { struct perf_event *event = vmf->vma->vm_file->private_data; struct perf_buffer *rb; vm_fault_t ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { if (vmf->pgoff == 0) ret = 0; return ret; } rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; vmf->page = perf_mmap_to_page(rb, vmf->pgoff); if (!vmf->page) goto unlock; get_page(vmf->page); vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; ret = 0; unlock: rcu_read_unlock(); return ret; } static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb) { struct perf_buffer *old_rb = NULL; unsigned long flags; WARN_ON_ONCE(event->parent); if (event->rb) { /* * Should be impossible, we set this when removing * event->rb_entry and wait/clear when adding event->rb_entry. */ WARN_ON_ONCE(event->rcu_pending); old_rb = event->rb; spin_lock_irqsave(&old_rb->event_lock, flags); list_del_rcu(&event->rb_entry); spin_unlock_irqrestore(&old_rb->event_lock, flags); event->rcu_batches = get_state_synchronize_rcu(); event->rcu_pending = 1; } if (rb) { if (event->rcu_pending) { cond_synchronize_rcu(event->rcu_batches); event->rcu_pending = 0; } spin_lock_irqsave(&rb->event_lock, flags); list_add_rcu(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } /* * Avoid racing with perf_mmap_close(AUX): stop the event * before swizzling the event::rb pointer; if it's getting * unmapped, its aux_mmap_count will be 0 and it won't * restart. See the comment in __perf_pmu_output_stop(). * * Data will inevitably be lost when set_output is done in * mid-air, but then again, whoever does it like this is * not in for the data anyway. */ if (has_aux(event)) perf_event_stop(event, 0); rcu_assign_pointer(event->rb, rb); if (old_rb) { ring_buffer_put(old_rb); /* * Since we detached before setting the new rb, so that we * could attach the new rb, we could have missed a wakeup. * Provide it now. */ wake_up_all(&event->waitq); } } static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { list_for_each_entry_rcu(event, &rb->event_list, rb_entry) wake_up_all(&event->waitq); } rcu_read_unlock(); } struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { if (!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); return rb; } void ring_buffer_put(struct perf_buffer *rb) { if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); if (vma->vm_pgoff) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). * * In order to undo the VM accounting done by perf_mmap() we need to destroy * the buffer here, where we still have a VM context. This means we need * to detach all events redirecting to us. */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; if (event->pmu->event_unmapped) event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and * event->mmap_count, so it is ok to use event->mmap_mutex to * serialize with perf_mmap here. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU * data. Note that after rb::aux_mmap_count dropped to zero, * they won't start any more (see perf_aux_output_begin()). */ perf_pmu_output_stop(event); /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); mutex_unlock(&event->mmap_mutex); } if (atomic_dec_and_test(&rb->mmap_count)) detach_rest = true; if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ if (!detach_rest) goto out_put; /* * No other mmap()s, detach from all other events that might redirect * into the now unreachable buffer. Somewhat complicated by the * fact that rb::event_lock otherwise nests inside mmap_mutex. */ again: rcu_read_lock(); list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { if (!atomic_long_inc_not_zero(&event->refcount)) { /* * This event is en-route to free_event() which will * detach it and remove it from the list. */ continue; } rcu_read_unlock(); mutex_lock(&event->mmap_mutex); /* * Check we didn't race with perf_event_set_output() which can * swizzle the rb from under us while we were waiting to * acquire mmap_mutex. * * If we find a different rb; ignore this event, a next * iteration will no longer find it on the list. We have to * still restart the iteration to make sure we're not now * iterating the wrong list. */ if (event->rb == rb) ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); put_event(event); /* * Restart the iteration; either we're on the wrong list or * destroyed its integrity by doing a deletion. */ goto again; } rcu_read_unlock(); /* * It could be there's still a few 0-ref events on the list; they'll * get cleaned up by free_event() -- they'll also still have their * ref on the rb and will free it whenever they are done with it. * * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting. */ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, &mmap_user->locked_vm); atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: ring_buffer_put(rb); /* could be last */ } static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); struct perf_buffer *rb = NULL; unsigned long locked, lock_limit; unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would * create a performance issue due to all children writing to the * same rb. */ if (event->cpu == -1 && event->attr.inherit) return -EINVAL; if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; ret = security_perf_event_read(event); if (ret) return ret; vma_size = vma->vm_end - vma->vm_start; if (vma->vm_pgoff == 0) { nr_pages = (vma_size / PAGE_SIZE) - 1; } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already * mapped, all subsequent mappings should have the same size * and offset. Must be above the normal perf buffer. */ u64 aux_offset, aux_size; if (!event->rb) return -EINVAL; nr_pages = vma_size / PAGE_SIZE; mutex_lock(&event->mmap_mutex); ret = -EINVAL; rb = event->rb; if (!rb) goto aux_unlock; aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) goto aux_unlock; if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) goto aux_unlock; /* already mapped with a different offset */ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) goto aux_unlock; if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) goto aux_unlock; /* already mapped with a different size */ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) goto aux_unlock; if (!is_power_of_2(nr_pages)) goto aux_unlock; if (!atomic_inc_not_zero(&rb->mmap_count)) goto aux_unlock; if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); ret = 0; goto unlock; } atomic_set(&rb->aux_mmap_count, 1); user_extra = nr_pages; goto accounting; } /* * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); if (event->rb) { if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } if (!atomic_inc_not_zero(&event->rb->mmap_count)) { /* * Raced against perf_mmap_close(); remove the * event and try again. */ ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); goto again; } goto unlock; } user_extra = nr_pages + 1; accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* * Increase the limit linearly with more CPUs: */ user_lock_limit *= num_online_cpus(); user_locked = atomic_long_read(&user->locked_vm); /* * sysctl_perf_event_mlock may have changed, so that * user->locked_vm > user_lock_limit */ if (user_locked > user_lock_limit) user_locked = user_lock_limit; user_locked += user_extra; if (user_locked > user_lock_limit) { /* * charge locked_vm until it hits user_lock_limit; * charge the rest from pinned_vm */ extra = user_locked - user_lock_limit; user_extra -= extra; } lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_is_paranoid() && !capable(CAP_IPC_LOCK)) { ret = -EPERM; goto unlock; } WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; if (!rb) { rb = rb_alloc(nr_pages, event->attr.watermark ? event->attr.wakeup_watermark : 0, event->cpu, flags); if (!rb) { ret = -ENOMEM; goto unlock; } atomic_set(&rb->mmap_count, 1); rb->mmap_user = get_current_user(); rb->mmap_locked = extra; ring_buffer_attach(event, rb); perf_event_update_time(event); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); if (!ret) rb->aux_mmap_locked = extra; } unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { atomic_dec(&rb->mmap_count); } aux_unlock: mutex_unlock(&event->mmap_mutex); /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); return ret; } static int perf_fasync(int fd, struct file *filp, int on) { struct inode *inode = file_inode(filp); struct perf_event *event = filp->private_data; int retval; inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); if (retval < 0) return retval; return 0; } static const struct file_operations perf_fops = { .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; /* * Perf event wakeup * * If there's data, ensure we set the poll() state and publish everything * to user-space before waking everybody up. */ static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) { /* only the parent has fasync state */ if (event->parent) event = event->parent; return &event->fasync; } void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); if (event->pending_kill) { kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); event->pending_kill = 0; } } static void perf_sigtrap(struct perf_event *event) { /* * We'd expect this to only occur if the irq_work is delayed and either * ctx->task or current has changed in the meantime. This can be the * case on architectures that do not implement arch_irq_work_raise(). */ if (WARN_ON_ONCE(event->ctx->task != current)) return; /* * Both perf_pending_task() and perf_pending_irq() can race with the * task exiting. */ if (current->flags & PF_EXITING) return; send_sig_perf((void __user *)event->pending_addr, event->orig_type, event->attr.sig_data); } /* * Deliver the pending work in-event-context or follow the context. */ static void __perf_pending_irq(struct perf_event *event) { int cpu = READ_ONCE(event->oncpu); /* * If the event isn't running; we done. event_sched_out() will have * taken care of things. */ if (cpu < 0) return; /* * Yay, we hit home and are in the context of the event. */ if (cpu == smp_processor_id()) { if (event->pending_sigtrap) { event->pending_sigtrap = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_pending); } if (event->pending_disable) { event->pending_disable = 0; perf_event_disable_local(event); } return; } /* * CPU-A CPU-B * * perf_event_disable_inatomic() * @pending_disable = CPU-A; * irq_work_queue(); * * sched-out * @pending_disable = -1; * * sched-in * perf_event_disable_inatomic() * @pending_disable = CPU-B; * irq_work_queue(); // FAILS * * irq_work_run() * perf_pending_irq() * * But the event runs on CPU-B and wants disabling there. */ irq_work_queue_on(&event->pending_irq, cpu); } static void perf_pending_irq(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending_irq); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ rctx = perf_swevent_get_recursion_context(); /* * The wakeup isn't bound to the context of the event -- it can happen * irrespective of where the event is. */ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } __perf_pending_irq(event); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } static void perf_pending_task(struct callback_head *head) { struct perf_event *event = container_of(head, struct perf_event, pending_task); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_pending); } if (rctx >= 0) perf_swevent_put_recursion_context(rctx); preempt_enable_notrace(); put_event(event); } #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) return; rcu_assign_pointer(perf_guest_cbs, cbs); static_call_update(__perf_guest_state, cbs->state); static_call_update(__perf_guest_get_ip, cbs->get_ip); /* Implementing ->handle_intel_pt_intr is optional. */ if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) return; rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); #endif static void perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; DECLARE_BITMAP(_mask, 64); bitmap_from_u64(_mask, mask); for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); perf_output_put(handle, val); } } static void perf_sample_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; } } static void perf_sample_regs_intr(struct perf_regs *regs_intr, struct pt_regs *regs) { regs_intr->regs = regs; regs_intr->abi = perf_reg_abi(current); } /* * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) { unsigned long addr = perf_user_stack_pointer(regs); if (!addr || addr >= TASK_SIZE) return 0; return TASK_SIZE - addr; } static u16 perf_sample_ustack_size(u16 stack_size, u16 header_size, struct pt_regs *regs) { u64 task_size; /* No regs, no stack pointer, no dump. */ if (!regs) return 0; /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE * If we don't, we limit the size to the TASK_SIZE. * * - remaining sample size * If we don't, we customize the stack size to * fit in to the remaining sample size. */ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); stack_size = min(stack_size, (u16) task_size); /* Current header size plus static size and dynamic size. */ header_size += 2 * sizeof(u64); /* Do we fit in with the current stack dump size? */ if ((u16) (header_size + stack_size) < header_size) { /* * If we overflow the maximum size for the sample, * we customize the stack dump size to fit in. */ stack_size = USHRT_MAX - header_size - sizeof(u64); stack_size = round_up(stack_size, sizeof(u64)); } return stack_size; } static void perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, struct pt_regs *regs) { /* Case of a kernel thread, nothing to dump */ if (!regs) { u64 size = 0; perf_output_put(handle, size); } else { unsigned long sp; unsigned int rem; u64 dyn_size; /* * We dump: * static size * - the size requested by user or the best one we can fit * in to the sample max size * data * - user stack dump data * dynamic size * - the actual dumped size */ /* Static size. */ perf_output_put(handle, dump_size); /* Data. */ sp = perf_user_stack_pointer(regs); rem = __output_copy_user(handle, (void *) sp, dump_size); dyn_size = dump_size - rem; perf_output_skip(handle, rem); /* Dynamic size. */ perf_output_put(handle, dyn_size); } } static unsigned long perf_prepare_sample_aux(struct perf_event *event, struct perf_sample_data *data, size_t size) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; data->aux_size = 0; if (!sampler) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; rb = ring_buffer_get(sampler); if (!rb) goto out; /* * If this is an NMI hit inside sampling code, don't take * the sample. See also perf_aux_sample_output(). */ if (READ_ONCE(rb->aux_in_sampling)) { data->aux_size = 0; } else { size = min_t(size_t, size, perf_aux_size(rb)); data->aux_size = ALIGN(size, sizeof(u64)); } ring_buffer_put(rb); out: return data->aux_size; } static long perf_pmu_snapshot_aux(struct perf_buffer *rb, struct perf_event *event, struct perf_output_handle *handle, unsigned long size) { unsigned long flags; long ret; /* * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler * paths. If we start calling them in NMI context, they may race with * the IRQ ones, that is, for example, re-starting an event that's just * been stopped, which is why we're using a separate callback that * doesn't change the event state. * * IRQs need to be disabled to prevent IPIs from racing with us. */ local_irq_save(flags); /* * Guard against NMI hits inside the critical section; * see also perf_prepare_sample_aux(). */ WRITE_ONCE(rb->aux_in_sampling, 1); barrier(); ret = event->pmu->snapshot_aux(event, handle, size); barrier(); WRITE_ONCE(rb->aux_in_sampling, 0); local_irq_restore(flags); return ret; } static void perf_aux_sample_output(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *data) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; unsigned long pad; long size; if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; rb = ring_buffer_get(sampler); if (!rb) return; size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); /* * An error here means that perf_output_copy() failed (returned a * non-zero surplus that it didn't copy), which in its current * enlightened implementation is not possible. If that changes, we'd * like to know. */ if (WARN_ON_ONCE(size < 0)) goto out_put; /* * The pad comes from ALIGN()ing data->aux_size up to u64 in * perf_prepare_sample_aux(), so should not be more than that. */ pad = data->aux_size - size; if (WARN_ON_ONCE(pad >= sizeof(u64))) pad = 8; if (pad) { u64 zero = 0; perf_output_copy(handle, &zero, pad); } out_put: ring_buffer_put(rb); } /* * A set of common sample data types saved even for non-sample records * when event->attr.sample_id_all is set. */ #define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) static void __perf_event_header__init_id(struct perf_sample_data *data, struct perf_event *event, u64 sample_type) { data->type = event->attr.sample_type; data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL; if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ data->tid_entry.pid = perf_event_pid(event, current); data->tid_entry.tid = perf_event_tid(event, current); } if (sample_type & PERF_SAMPLE_TIME) data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); if (sample_type & PERF_SAMPLE_STREAM_ID) data->stream_id = event->id; if (sample_type & PERF_SAMPLE_CPU) { data->cpu_entry.cpu = raw_smp_processor_id(); data->cpu_entry.reserved = 0; } } void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { if (event->attr.sample_id_all) { header->size += event->id_header_size; __perf_event_header__init_id(data, event, event->attr.sample_type); } } static void __perf_event__output_id_sample(struct perf_output_handle *handle, struct perf_sample_data *data) { u64 sample_type = data->type; if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); } void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample) { if (event->attr.sample_id_all) __perf_event__output_id_sample(handle, sample); } static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[5]; int n = 0; values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } static void perf_output_read_group(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; unsigned long flags; u64 values[6]; int n = 0; /* * Disabling interrupts avoids all counter scheduling * (context switches, timer based rotation and IPIs). */ local_irq_save(flags); values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if ((leader != event) && (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); for_each_sibling_event(sub, leader) { n = 0; if ((sub != event) && (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ PERF_FORMAT_TOTAL_TIME_RUNNING) /* * XXX PERF_SAMPLE_READ vs inherited events seems difficult. * * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. * * we cannot simply called update_context_time() * because of locking issue as we are called in * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); else perf_output_read_one(handle, event, enabled, running); } void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { u64 sample_type = data->type; perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_IP) perf_output_put(handle, data->ip); if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(handle, data->addr); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(handle, data->period); if (sample_type & PERF_SAMPLE_READ) perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; size += data->callchain->nr; size *= sizeof(u64); __output_copy(handle, data->callchain, size); } if (sample_type & PERF_SAMPLE_RAW) { struct perf_raw_record *raw = data->raw; if (raw) { struct perf_raw_frag *frag = &raw->frag; perf_output_put(handle, raw->size); do { if (frag->copy) { __output_custom(handle, frag->copy, frag->data, frag->size); } else { __output_copy(handle, frag->data, frag->size); } if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); if (frag->pad) __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; u32 data; } raw = { .size = sizeof(u32), .data = 0, }; perf_output_put(handle, raw); } } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { if (data->br_stack) { size_t size; size = data->br_stack->nr * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* * we always store at least the value of nr */ u64 nr = 0; perf_output_put(handle, nr); } } if (sample_type & PERF_SAMPLE_REGS_USER) { u64 abi = data->regs_user.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). */ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_user; perf_output_sample_regs(handle, data->regs_user.regs, mask); } } if (sample_type & PERF_SAMPLE_STACK_USER) { perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); } if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); if (sample_type & PERF_SAMPLE_REGS_INTR) { u64 abi = data->regs_intr.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). */ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_intr; perf_output_sample_regs(handle, data->regs_intr.regs, mask); } } if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) perf_output_put(handle, data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) perf_output_put(handle, data->code_page_size); if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); if (data->aux_size) perf_aux_sample_output(event, handle, data); } if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; if (wakeup_events) { struct perf_buffer *rb = handle->rb; int events = local_inc_return(&rb->events); if (events >= wakeup_events) { local_sub(wakeup_events, &rb->events); local_inc(&rb->wakeup); } } } } static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; if (!virt) return 0; if (virt >= TASK_SIZE) { /* If it's vmalloc()d memory, leave phys_addr as 0 */ if (virt_addr_valid((void *)(uintptr_t)virt) && !(virt >= VMALLOC_START && virt < VMALLOC_END)) phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); } else { /* * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { struct page *p; pagefault_disable(); if (get_user_page_fast_only(virt, 0, &p)) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; put_page(p); } pagefault_enable(); } } return phys_addr; } /* * Return the pagetable size of a given virtual address. */ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) { u64 size = 0; #ifdef CONFIG_HAVE_FAST_GUP pgd_t *pgdp, pgd; p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; pgdp = pgd_offset(mm, addr); pgd = READ_ONCE(*pgdp); if (pgd_none(pgd)) return 0; if (pgd_leaf(pgd)) return pgd_leaf_size(pgd); p4dp = p4d_offset_lockless(pgdp, pgd, addr); p4d = READ_ONCE(*p4dp); if (!p4d_present(p4d)) return 0; if (p4d_leaf(p4d)) return p4d_leaf_size(p4d); pudp = pud_offset_lockless(p4dp, p4d, addr); pud = READ_ONCE(*pudp); if (!pud_present(pud)) return 0; if (pud_leaf(pud)) return pud_leaf_size(pud); pmdp = pmd_offset_lockless(pudp, pud, addr); again: pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; if (pmd_leaf(pmd)) return pmd_leaf_size(pmd); ptep = pte_offset_map(&pmd, addr); if (!ptep) goto again; pte = ptep_get_lockless(ptep); if (pte_present(pte)) size = pte_leaf_size(pte); pte_unmap(ptep); #endif /* CONFIG_HAVE_FAST_GUP */ return size; } static u64 perf_get_page_size(unsigned long addr) { struct mm_struct *mm; unsigned long flags; u64 size; if (!addr) return 0; /* * Software page-table walkers must disable IRQs, * which prevents any tear down of the page tables. */ local_irq_save(flags); mm = current->mm; if (!mm) { /* * For kernel threads and the like, use init_mm so that * we can find kernel memory. */ mm = &init_mm; } size = perf_get_pgtable_size(mm, addr); local_irq_restore(flags); return size; } static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; if (!kernel && !user) return &__empty_callchain; callchain = get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) { return d * !!(flags & s); } void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; u64 filtered_sample_type; /* * Add the sample flags that are dependent to others. And clear the * sample flags that have already been done by the PMU driver. */ filtered_sample_type = sample_type; filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, PERF_SAMPLE_IP); filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER); filtered_sample_type &= ~data->sample_flags; if (filtered_sample_type == 0) { /* Make sure it has the correct data->type for output */ data->type = event->attr.sample_type; return; } __perf_event_header__init_id(data, event, filtered_sample_type); if (filtered_sample_type & PERF_SAMPLE_IP) { data->ip = perf_instruction_pointer(regs); data->sample_flags |= PERF_SAMPLE_IP; } if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) perf_sample_save_callchain(data, event, regs); if (filtered_sample_type & PERF_SAMPLE_RAW) { data->raw = NULL; data->dyn_size += sizeof(u64); data->sample_flags |= PERF_SAMPLE_RAW; } if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { data->br_stack = NULL; data->dyn_size += sizeof(u64); data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (filtered_sample_type & PERF_SAMPLE_REGS_USER) perf_sample_regs_user(&data->regs_user, regs); /* * It cannot use the filtered_sample_type here as REGS_USER can be set * by STACK_USER (using __cond_set() above) and we don't want to update * the dyn_size if it's not requested by users. */ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); if (data->regs_user.regs) { u64 mask = event->attr.sample_regs_user; size += hweight64(mask) * sizeof(u64); } data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_REGS_USER; } if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { /* * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. */ u16 stack_size = event->attr.sample_stack_user; u16 header_size = perf_sample_data_size(data, event); u16 size = sizeof(u64); stack_size = perf_sample_ustack_size(stack_size, header_size, data->regs_user.regs); /* * If there is something to dump, add space for the dump * itself and for the field that tells the dynamic size, * which is how many have been actually dumped. */ if (stack_size) size += sizeof(u64) + stack_size; data->stack_user_size = stack_size; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_STACK_USER; } if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { data->weight.full = 0; data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = PERF_MEM_NA; data->sample_flags |= PERF_SAMPLE_DATA_SRC; } if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = 0; data->sample_flags |= PERF_SAMPLE_TRANSACTION; } if (filtered_sample_type & PERF_SAMPLE_ADDR) { data->addr = 0; data->sample_flags |= PERF_SAMPLE_ADDR; } if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); perf_sample_regs_intr(&data->regs_intr, regs); if (data->regs_intr.regs) { u64 mask = event->attr.sample_regs_intr; size += hweight64(mask) * sizeof(u64); } data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_REGS_INTR; } if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) { data->phys_addr = perf_virt_to_phys(data->addr); data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; } #ifdef CONFIG_CGROUP_PERF if (filtered_sample_type & PERF_SAMPLE_CGROUP) { struct cgroup *cgrp; /* protected by RCU */ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; data->cgroup = cgroup_id(cgrp); data->sample_flags |= PERF_SAMPLE_CGROUP; } #endif /* * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, * but the value will not dump to the userspace. */ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) { data->data_page_size = perf_get_page_size(data->addr); data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; } if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) { data->code_page_size = perf_get_page_size(data->ip); data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; } if (filtered_sample_type & PERF_SAMPLE_AUX) { u64 size; u16 header_size = perf_sample_data_size(data, event); header_size += sizeof(u64); /* size */ /* * Given the 16bit nature of header::size, an AUX sample can * easily overflow it, what with all the preceding sample bits. * Make sure this doesn't happen by using up to U16_MAX bytes * per sample in total (rounded down to 8 byte boundary). */ size = min_t(size_t, U16_MAX - header_size, event->attr.aux_sample_size); size = rounddown(size, 8); size = perf_prepare_sample_aux(event, data, size); WARN_ON_ONCE(size + header_size > U16_MAX); data->dyn_size += size + sizeof(u64); /* size above */ data->sample_flags |= PERF_SAMPLE_AUX; } } void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { header->type = PERF_RECORD_SAMPLE; header->size = perf_sample_data_size(data, event); header->misc = perf_misc_flags(regs); /* * If you're adding more sample types here, you likely need to do * something about the overflowing header::size, like repurpose the * lowest 3 bits of size, which should be always zero at the moment. * This raises a more important question, do we really need 512k sized * samples and why, so good argumentation is in order for whatever you * do here next. */ WARN_ON_ONCE(header->size & 7); } static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, struct perf_sample_data *, struct perf_event *, unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; int err; /* protect the callchain buffers */ rcu_read_lock(); perf_prepare_sample(data, event, regs); perf_prepare_header(&header, data, event, regs); err = output_begin(&handle, data, event, header.size); if (err) goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); exit: rcu_read_unlock(); return err; } void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_forward); } void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_backward); } int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_output(event, data, regs, perf_output_begin); } /* * read event_id */ struct perf_read_event { struct perf_event_header header; u32 pid; u32 tid; }; static void perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_read_event read_event = { .header = { .type = PERF_RECORD_READ, .misc = 0, .size = sizeof(read_event) + event->read_size, }, .pid = perf_event_pid(event, task), .tid = perf_event_tid(event, task), }; int ret; perf_event_header__init_id(&read_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; perf_output_put(&handle, read_event); perf_output_read(&handle, event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void perf_iterate_ctx(struct perf_event_context *ctx, perf_iterate_f output, void *data, bool all) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (!all) { if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; } output(event, data); } } static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); struct perf_event *event; list_for_each_entry_rcu(event, &pel->list, sb_list) { /* * Skip events that are not fully formed yet; ensure that * if we observe event->ctx, both event and ctx will be * complete enough. See perf_install_in_context(). */ if (!smp_load_acquire(&event->ctx)) continue; if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; output(event, data); } } /* * Iterate all events that need to receive side-band events. * * For new callers; ensure that account_pmu_sb_event() includes * your event, otherwise it might not get delivered. */ static void perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; rcu_read_lock(); preempt_disable(); /* * If we have task_ctx != NULL we only notify the task context itself. * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { perf_iterate_ctx(task_ctx, output, data, false); goto done; } perf_iterate_sb_cpu(output, data); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_iterate_ctx(ctx, output, data, false); done: preempt_enable(); rcu_read_unlock(); } /* * Clear all file-based filters at exec, they'll have to be * re-instated when/if these objects are mmapped again. */ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; restart++; } count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } void perf_event_exec(void) { struct perf_event_context *ctx; ctx = perf_pin_task_context(current); if (!ctx) return; perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); } struct remote_output { struct perf_buffer *rb; int err; }; static void __perf_event_output_stop(struct perf_event *event, void *data) { struct perf_event *parent = event->parent; struct remote_output *ro = data; struct perf_buffer *rb = ro->rb; struct stop_event_data sd = { .event = event, }; if (!has_aux(event)) return; if (!parent) parent = event; /* * In case of inheritance, it will be the parent that links to the * ring-buffer, but it will be the child that's actually using it. * * We are using event::rb to determine if the event should be stopped, * however this may race with ring_buffer_attach() (through set_output), * which will make us skip the event that actually needs to be stopped. * So ring_buffer_attach() has to stop an aux event before re-assigning * its rb pointer. */ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); } static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct remote_output ro = { .rb = event->rb, }; rcu_read_lock(); perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); return ro.err; } static void perf_pmu_output_stop(struct perf_event *event) { struct perf_event *iter; int err, cpu; restart: rcu_read_lock(); list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { /* * For per-CPU events, we need to make sure that neither they * nor their children are running; for cpu==-1 events it's * sufficient to stop the event itself if it's active, since * it can't have children. */ cpu = iter->cpu; if (cpu == -1) cpu = READ_ONCE(iter->oncpu); if (cpu == -1) continue; err = cpu_function_call(cpu, __perf_pmu_output_stop, event); if (err == -EAGAIN) { rcu_read_unlock(); goto restart; } } rcu_read_unlock(); } /* * task tracking -- fork/exit * * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task */ struct perf_task_event { struct task_struct *task; struct perf_event_context *task_ctx; struct { struct perf_event_header header; u32 pid; u32 ppid; u32 tid; u32 ptid; u64 time; } event_id; }; static int perf_event_task_match(struct perf_event *event) { return event->attr.comm || event->attr.mmap || event->attr.mmap2 || event->attr.mmap_data || event->attr.task; } static void perf_event_task_output(struct perf_event *event, void *data) { struct perf_task_event *task_event = data; struct perf_output_handle handle; struct perf_sample_data sample; struct task_struct *task = task_event->task; int ret, size = task_event->event_id.header.size; if (!perf_event_task_match(event)) return; perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.tid = perf_event_tid(event, task); if (task_event->event_id.header.type == PERF_RECORD_EXIT) { task_event->event_id.ppid = perf_event_pid(event, task->real_parent); task_event->event_id.ptid = perf_event_pid(event, task->real_parent); } else { /* PERF_RECORD_FORK */ task_event->event_id.ppid = perf_event_pid(event, current); task_event->event_id.ptid = perf_event_tid(event, current); } task_event->event_id.time = perf_event_clock(event); perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: task_event->event_id.header.size = size; } static void perf_event_task(struct task_struct *task, struct perf_event_context *task_ctx, int new) { struct perf_task_event task_event; if (!atomic_read(&nr_comm_events) && !atomic_read(&nr_mmap_events) && !atomic_read(&nr_task_events)) return; task_event = (struct perf_task_event){ .task = task, .task_ctx = task_ctx, .event_id = { .header = { .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, .misc = 0, .size = sizeof(task_event.event_id), }, /* .pid */ /* .ppid */ /* .tid */ /* .ptid */ /* .time */ }, }; perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); } /* * comm tracking */ struct perf_comm_event { struct task_struct *task; char *comm; int comm_size; struct { struct perf_event_header header; u32 pid; u32 tid; } event_id; }; static int perf_event_comm_match(struct perf_event *event) { return event->attr.comm; } static void perf_event_comm_output(struct perf_event *event, void *data) { struct perf_comm_event *comm_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = comm_event->event_id.header.size; int ret; if (!perf_event_comm_match(event)) return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) goto out; comm_event->event_id.pid = perf_event_pid(event, comm_event->task); comm_event->event_id.tid = perf_event_tid(event, comm_event->task); perf_output_put(&handle, comm_event->event_id); __output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: comm_event->event_id.header.size = size; } static void perf_event_comm_event(struct perf_comm_event *comm_event) { char comm[TASK_COMM_LEN]; unsigned int size; memset(comm, 0, sizeof(comm)); strscpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; if (!atomic_read(&nr_comm_events)) return; comm_event = (struct perf_comm_event){ .task = task, /* .comm */ /* .comm_size */ .event_id = { .header = { .type = PERF_RECORD_COMM, .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ /* .tid */ }, }; perf_event_comm_event(&comm_event); } /* * namespaces tracking */ struct perf_namespaces_event { struct task_struct *task; struct { struct perf_event_header header; u32 pid; u32 tid; u64 nr_namespaces; struct perf_ns_link_info link_info[NR_NAMESPACES]; } event_id; }; static int perf_event_namespaces_match(struct perf_event *event) { return event->attr.namespaces; } static void perf_event_namespaces_output(struct perf_event *event, void *data) { struct perf_namespaces_event *namespaces_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = namespaces_event->event_id.header.size; int ret; if (!perf_event_namespaces_match(event)) return; perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; namespaces_event->event_id.pid = perf_event_pid(event, namespaces_event->task); namespaces_event->event_id.tid = perf_event_tid(event, namespaces_event->task); perf_output_put(&handle, namespaces_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: namespaces_event->event_id.header.size = header_size; } static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, struct task_struct *task, const struct proc_ns_operations *ns_ops) { struct path ns_path; struct inode *ns_inode; int error; error = ns_get_path(&ns_path, task, ns_ops); if (!error) { ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; path_put(&ns_path); } } void perf_event_namespaces(struct task_struct *task) { struct perf_namespaces_event namespaces_event; struct perf_ns_link_info *ns_link_info; if (!atomic_read(&nr_namespaces_events)) return; namespaces_event = (struct perf_namespaces_event){ .task = task, .event_id = { .header = { .type = PERF_RECORD_NAMESPACES, .misc = 0, .size = sizeof(namespaces_event.event_id), }, /* .pid */ /* .tid */ .nr_namespaces = NR_NAMESPACES, /* .link_info[NR_NAMESPACES] */ }, }; ns_link_info = namespaces_event.event_id.link_info; perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], task, &mntns_operations); #ifdef CONFIG_USER_NS perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], task, &userns_operations); #endif #ifdef CONFIG_NET_NS perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], task, &netns_operations); #endif #ifdef CONFIG_UTS_NS perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], task, &utsns_operations); #endif #ifdef CONFIG_IPC_NS perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], task, &ipcns_operations); #endif #ifdef CONFIG_PID_NS perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], task, &pidns_operations); #endif #ifdef CONFIG_CGROUPS perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], task, &cgroupns_operations); #endif perf_iterate_sb(perf_event_namespaces_output, &namespaces_event, NULL); } /* * cgroup tracking */ #ifdef CONFIG_CGROUP_PERF struct perf_cgroup_event { char *path; int path_size; struct { struct perf_event_header header; u64 id; char path[]; } event_id; }; static int perf_event_cgroup_match(struct perf_event *event) { return event->attr.cgroup; } static void perf_event_cgroup_output(struct perf_event *event, void *data) { struct perf_cgroup_event *cgroup_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = cgroup_event->event_id.header.size; int ret; if (!perf_event_cgroup_match(event)) return; perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; perf_output_put(&handle, cgroup_event->event_id); __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: cgroup_event->event_id.header.size = header_size; } static void perf_event_cgroup(struct cgroup *cgrp) { struct perf_cgroup_event cgroup_event; char path_enomem[16] = "//enomem"; char *pathname; size_t size; if (!atomic_read(&nr_cgroup_events)) return; cgroup_event = (struct perf_cgroup_event){ .event_id = { .header = { .type = PERF_RECORD_CGROUP, .misc = 0, .size = sizeof(cgroup_event.event_id), }, .id = cgroup_id(cgrp), }, }; pathname = kmalloc(PATH_MAX, GFP_KERNEL); if (pathname == NULL) { cgroup_event.path = path_enomem; } else { /* just to be sure to have enough space for alignment */ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); cgroup_event.path = pathname; } /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(cgroup_event.path) + 1; while (!IS_ALIGNED(size, sizeof(u64))) cgroup_event.path[size++] = '\0'; cgroup_event.event_id.header.size += size; cgroup_event.path_size = size; perf_iterate_sb(perf_event_cgroup_output, &cgroup_event, NULL); kfree(pathname); } #endif /* * mmap tracking */ struct perf_mmap_event { struct vm_area_struct *vma; const char *file_name; int file_size; int maj, min; u64 ino; u64 ino_generation; u32 prot, flags; u8 build_id[BUILD_ID_SIZE_MAX]; u32 build_id_size; struct { struct perf_event_header header; u32 pid; u32 tid; u64 start; u64 len; u64 pgoff; } event_id; }; static int perf_event_mmap_match(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct vm_area_struct *vma = mmap_event->vma; int executable = vma->vm_flags & VM_EXEC; return (!executable && event->attr.mmap_data) || (executable && (event->attr.mmap || event->attr.mmap2)); } static void perf_event_mmap_output(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) return; if (event->attr.mmap2) { mmap_event->event_id.header.type = PERF_RECORD_MMAP2; mmap_event->event_id.header.size += sizeof(mmap_event->maj); mmap_event->event_id.header.size += sizeof(mmap_event->min); mmap_event->event_id.header.size += sizeof(mmap_event->ino); mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); mmap_event->event_id.header.size += sizeof(mmap_event->prot); mmap_event->event_id.header.size += sizeof(mmap_event->flags); } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); use_build_id = event->attr.build_id && mmap_event->build_id_size; if (event->attr.mmap2 && use_build_id) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; perf_output_put(&handle, mmap_event->event_id); if (event->attr.mmap2) { if (use_build_id) { u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; __output_copy(&handle, size, 4); __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); } else { perf_output_put(&handle, mmap_event->maj); perf_output_put(&handle, mmap_event->min); perf_output_put(&handle, mmap_event->ino); perf_output_put(&handle, mmap_event->ino_generation); } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: mmap_event->event_id.header.size = size; mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; int maj = 0, min = 0; u64 ino = 0, gen = 0; u32 prot = 0, flags = 0; unsigned int size; char tmp[16]; char *buf = NULL; char *name = NULL; if (vma->vm_flags & VM_READ) prot |= PROT_READ; if (vma->vm_flags & VM_WRITE) prot |= PROT_WRITE; if (vma->vm_flags & VM_EXEC) prot |= PROT_EXEC; if (vma->vm_flags & VM_MAYSHARE) flags = MAP_SHARED; else flags = MAP_PRIVATE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { struct inode *inode; dev_t dev; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) { name = "//enomem"; goto cpy_name; } /* * d_path() works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ name = file_path(file, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; } inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); goto got_name; } else { if (vma->vm_ops && vma->vm_ops->name) name = (char *) vma->vm_ops->name(vma); if (!name) name = (char *)arch_vma_name(vma); if (!name) { if (vma_is_initial_heap(vma)) name = "[heap]"; else if (vma_is_initial_stack(vma)) name = "[stack]"; else name = "//anon"; } } cpy_name: strscpy(tmp, name, sizeof(tmp)); name = tmp; got_name: /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(name)+1; while (!IS_ALIGNED(size, sizeof(u64))) name[size++] = '\0'; mmap_event->file_name = name; mmap_event->file_size = size; mmap_event->maj = maj; mmap_event->min = min; mmap_event->ino = ino; mmap_event->ino_generation = gen; mmap_event->prot = prot; mmap_event->flags = flags; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; if (atomic_read(&nr_build_id_events)) build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); kfree(buf); } /* * Check whether inode and address range match filter criteria. */ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { /* d_inode(NULL) won't be equal to any mapped user-space file */ if (!filter->path.dentry) return false; if (d_inode(filter->path.dentry) != file_inode(file)) return false; if (filter->offset > offset + size) return false; if (filter->offset + filter->size < offset) return false; return true; } static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, struct vm_area_struct *vma, struct perf_addr_filter_range *fr) { unsigned long vma_size = vma->vm_end - vma->vm_start; unsigned long off = vma->vm_pgoff << PAGE_SHIFT; struct file *file = vma->vm_file; if (!perf_addr_filter_match(filter, file, off, vma_size)) return false; if (filter->offset < off) { fr->start = vma->vm_start; fr->size = min(vma_size, filter->size - (off - filter->offset)); } else { fr->start = vma->vm_start + filter->offset - off; fr->size = min(vma->vm_end - fr->start, filter->size); } return true; } static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (perf_addr_filter_vma_adjust(filter, vma, &event->addr_filter_ranges[count])) restart++; count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } /* * Adjust all task's events' filters to the new vma */ static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; /* * Data tracing isn't supported yet and as such there is no need * to keep track of anything that isn't related to executable code: */ if (!(vma->vm_flags & VM_EXEC)) return; rcu_read_lock(); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); rcu_read_unlock(); } void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; if (!atomic_read(&nr_mmap_events)) return; mmap_event = (struct perf_mmap_event){ .vma = vma, /* .file_name */ /* .file_size */ .event_id = { .header = { .type = PERF_RECORD_MMAP, .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, /* .maj (attr_mmap2 only) */ /* .min (attr_mmap2 only) */ /* .ino (attr_mmap2 only) */ /* .ino_generation (attr_mmap2 only) */ /* .prot (attr_mmap2 only) */ /* .flags (attr_mmap2 only) */ }; perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 offset; u64 size; u64 flags; } rec = { .header = { .type = PERF_RECORD_AUX, .misc = 0, .size = sizeof(rec), }, .offset = head, .size = size, .flags = flags, }; int ret; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * Lost/dropped samples logging */ void perf_log_lost_samples(struct perf_event *event, u64 lost) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 lost; } lost_samples_event = { .header = { .type = PERF_RECORD_LOST_SAMPLES, .misc = 0, .size = sizeof(lost_samples_event), }, .lost = lost, }; perf_event_header__init_id(&lost_samples_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; perf_output_put(&handle, lost_samples_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * context_switch tracking */ struct perf_switch_event { struct task_struct *task; struct task_struct *next_prev; struct { struct perf_event_header header; u32 next_prev_pid; u32 next_prev_tid; } event_id; }; static int perf_event_switch_match(struct perf_event *event) { return event->attr.context_switch; } static void perf_event_switch_output(struct perf_event *event, void *data) { struct perf_switch_event *se = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_switch_match(event)) return; /* Only CPU-wide events are allowed to see next/prev pid/tid */ if (event->ctx->task) { se->event_id.header.type = PERF_RECORD_SWITCH; se->event_id.header.size = sizeof(se->event_id.header); } else { se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; se->event_id.header.size = sizeof(se->event_id); se->event_id.next_prev_pid = perf_event_pid(event, se->next_prev); se->event_id.next_prev_tid = perf_event_tid(event, se->next_prev); } perf_event_header__init_id(&se->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; if (event->ctx->task) perf_output_put(&handle, se->event_id.header); else perf_output_put(&handle, se->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in) { struct perf_switch_event switch_event; /* N.B. caller checks nr_switch_events != 0 */ switch_event = (struct perf_switch_event){ .task = task, .next_prev = next_prev, .event_id = { .header = { /* .type */ .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, /* .size */ }, /* .next_prev_pid */ /* .next_prev_tid */ }, }; if (!sched_in && task->on_rq) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } /* * IRQ throttle logging */ static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 time; u64 id; u64 stream_id; } throttle_event = { .header = { .type = PERF_RECORD_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; if (enable) throttle_event.header.type = PERF_RECORD_UNTHROTTLE; perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; perf_output_put(&handle, throttle_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * ksymbol register/unregister tracking */ struct perf_ksymbol_event { const char *name; int name_len; struct { struct perf_event_header header; u64 addr; u32 len; u16 ksym_type; u16 flags; } event_id; }; static int perf_event_ksymbol_match(struct perf_event *event) { return event->attr.ksymbol; } static void perf_event_ksymbol_output(struct perf_event *event, void *data) { struct perf_ksymbol_event *ksymbol_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_ksymbol_match(event)) return; perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; perf_output_put(&handle, ksymbol_event->event_id); __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { struct perf_ksymbol_event ksymbol_event; char name[KSYM_NAME_LEN]; u16 flags = 0; int name_len; if (!atomic_read(&nr_ksymbol_events)) return; if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; strscpy(name, sym, KSYM_NAME_LEN); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); if (unregister) flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; ksymbol_event = (struct perf_ksymbol_event){ .name = name, .name_len = name_len, .event_id = { .header = { .type = PERF_RECORD_KSYMBOL, .size = sizeof(ksymbol_event.event_id) + name_len, }, .addr = addr, .len = len, .ksym_type = ksym_type, .flags = flags, }, }; perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); return; err: WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); } /* * bpf program load/unload tracking */ struct perf_bpf_event { struct bpf_prog *prog; struct { struct perf_event_header header; u16 type; u16 flags; u32 id; u8 tag[BPF_TAG_SIZE]; } event_id; }; static int perf_event_bpf_match(struct perf_event *event) { return event->attr.bpf_event; } static void perf_event_bpf_output(struct perf_event *event, void *data) { struct perf_bpf_event *bpf_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_bpf_match(event)) return; perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, bpf_event->event_id.header.size); if (ret) return; perf_output_put(&handle, bpf_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, enum perf_bpf_event_type type) { bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; int i; if (prog->aux->func_cnt == 0) { perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)prog->bpf_func, prog->jited_len, unregister, prog->aux->ksym.name); } else { for (i = 0; i < prog->aux->func_cnt; i++) { struct bpf_prog *subprog = prog->aux->func[i]; perf_event_ksymbol( PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, subprog->jited_len, unregister, subprog->aux->ksym.name); } } } void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { struct perf_bpf_event bpf_event; if (type <= PERF_BPF_EVENT_UNKNOWN || type >= PERF_BPF_EVENT_MAX) return; switch (type) { case PERF_BPF_EVENT_PROG_LOAD: case PERF_BPF_EVENT_PROG_UNLOAD: if (atomic_read(&nr_ksymbol_events)) perf_event_bpf_emit_ksymbols(prog, type); break; default: break; } if (!atomic_read(&nr_bpf_events)) return; bpf_event = (struct perf_bpf_event){ .prog = prog, .event_id = { .header = { .type = PERF_RECORD_BPF_EVENT, .size = sizeof(bpf_event.event_id), }, .type = type, .flags = flags, .id = prog->aux->id, }, }; BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; size_t pad; u16 old_len; u16 new_len; struct { struct perf_event_header header; u64 addr; } event_id; }; static int perf_event_text_poke_match(struct perf_event *event) { return event->attr.text_poke; } static void perf_event_text_poke_output(struct perf_event *event, void *data) { struct perf_text_poke_event *text_poke_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u64 padding = 0; int ret; if (!perf_event_text_poke_match(event)) return; perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, text_poke_event->event_id.header.size); if (ret) return; perf_output_put(&handle, text_poke_event->event_id); perf_output_put(&handle, text_poke_event->old_len); perf_output_put(&handle, text_poke_event->new_len); __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); if (text_poke_event->pad) __output_copy(&handle, &padding, text_poke_event->pad); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { struct perf_text_poke_event text_poke_event; size_t tot, pad; if (!atomic_read(&nr_text_poke_events)) return; tot = sizeof(text_poke_event.old_len) + old_len; tot += sizeof(text_poke_event.new_len) + new_len; pad = ALIGN(tot, sizeof(u64)) - tot; text_poke_event = (struct perf_text_poke_event){ .old_bytes = old_bytes, .new_bytes = new_bytes, .pad = pad, .old_len = old_len, .new_len = new_len, .event_id = { .header = { .type = PERF_RECORD_TEXT_POKE, .misc = PERF_RECORD_MISC_KERNEL, .size = sizeof(text_poke_event.event_id) + tot + pad, }, .addr = (unsigned long)addr, }, }; perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); } void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; } static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u32 pid; u32 tid; } rec; int ret; if (event->parent) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); rec.pid = perf_event_pid(event, current); rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_report_aux_output_id(struct perf_event *event, u64 hw_id) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 hw_id; } rec; int ret; if (event->parent) event = event->parent; rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID; rec.header.misc = 0; rec.header.size = sizeof(rec); rec.hw_id = hw_id; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } EXPORT_SYMBOL_GPL(perf_report_aux_output_id); static int __perf_event_account_interrupt(struct perf_event *event, int throttle) { struct hw_perf_event *hwc = &event->hw; int ret = 0; u64 seq; seq = __this_cpu_read(perf_throttled_seq); if (seq != hwc->interrupts_seq) { hwc->interrupts_seq = seq; hwc->interrupts = 1; } else { hwc->interrupts++; if (unlikely(throttle && hwc->interrupts > max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); ret = 1; } } if (event->attr.freq) { u64 now = perf_clock(); s64 delta = now - hwc->freq_time_stamp; hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) perf_adjust_period(event, delta, hwc->last_period, true); } return ret; } int perf_event_account_interrupt(struct perf_event *event) { return __perf_event_account_interrupt(event, 1); } static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) { /* * Due to interrupt latency (AKA "skid"), we may enter the * kernel before taking an overflow, even if the PMU is only * counting user events. */ if (event->attr.exclude_kernel && !user_mode(regs)) return false; return true; } /* * Generic event overflow handling, sampling. */ static int __perf_event_overflow(struct perf_event *event, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; /* * Non-sampling counters might still use the PMI to fold short * hardware counters, ignore those. */ if (unlikely(!is_sampling_event(event))) return 0; ret = __perf_event_account_interrupt(event, throttle); /* * XXX event_limit might not quite work as expected on inherited * events */ event->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; perf_event_disable_inatomic(event); } if (event->attr.sigtrap) { /* * The desired behaviour of sigtrap vs invalid samples is a bit * tricky; on the one hand, one should not loose the SIGTRAP if * it is the first event, on the other hand, we should also not * trigger the WARN or override the data address. */ bool valid_sample = sample_is_allowed(event, regs); unsigned int pending_id = 1; if (regs) pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; if (!event->pending_sigtrap) { event->pending_sigtrap = pending_id; local_inc(&event->ctx->nr_pending); } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without * consuming pending_sigtrap; with exceptions: * * 1. Where !exclude_kernel, events can overflow again * in the kernel without returning to user space. * * 2. Events that can overflow again before the IRQ- * work without user space progress (e.g. hrtimer). * To approximate progress (with false negatives), * check 32-bit hash of the current IP. */ WARN_ON_ONCE(event->pending_sigtrap != pending_id); } event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) event->pending_addr = data->addr; irq_work_queue(&event->pending_irq); } READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; irq_work_queue(&event->pending_irq); } return ret; } int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } /* * Generic software event infrastructure */ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. This period event * is kept in the range [-sample_period, 0] so that we can use the * sign as trigger. */ u64 perf_swevent_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; u64 nr, offset; s64 old, val; hwc->last_period = hwc->sample_period; old = local64_read(&hwc->period_left); do { val = old; if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; } while (!local64_try_cmpxchg(&hwc->period_left, &old, val)); return nr; } static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; if (!overflow) overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; for (; overflow; overflow--) { if (__perf_event_overflow(event, throttle, data, regs)) { /* * We inhibit the overflow from happening when * hwc->interrupts == MAX_INTERRUPTS. */ break; } throttle = 1; } } static void perf_swevent_event(struct perf_event *event, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; local64_add(nr, &event->count); if (!regs) return; if (!is_sampling_event(event)) return; if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { data->period = nr; return perf_swevent_overflow(event, 1, data, regs); } else data->period = event->hw.last_period; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, data, regs); } static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; if (regs) { if (event->attr.exclude_user && user_mode(regs)) return 1; if (event->attr.exclude_kernel && !user_mode(regs)) return 1; } return 0; } static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { if (event->attr.type != type) return 0; if (event->attr.config != event_id) return 0; if (perf_exclude_event(event, regs)) return 0; return 1; } static inline u64 swevent_hash(u64 type, u32 event_id) { u64 val = event_id | (type << 32); return hash_64(val, SWEVENT_HLIST_BITS); } static inline struct hlist_head * __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) { u64 hash = swevent_hash(type, event_id); return &hlist->heads[hash]; } /* For the read side: events when they trigger */ static inline struct hlist_head * find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) { struct swevent_hlist *hlist; hlist = rcu_dereference(swhash->swevent_hlist); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } /* For the event head insertion and removal in the hlist */ static inline struct hlist_head * find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) { struct swevent_hlist *hlist; u32 event_id = event->attr.config; u64 type = event->attr.type; /* * Event scheduling is always serialized against hlist allocation * and release. Which makes the protected version suitable here. * The context lock guarantees that. */ hlist = rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct perf_event *event; struct hlist_head *head; rcu_read_lock(); head = find_swevent_head_rcu(swhash, type, event_id); if (!head) goto end; hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_event(event, nr, data, regs); } end: rcu_read_unlock(); } DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); return get_recursion_context(swhash->recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); put_recursion_context(swhash->recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); } void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { int rctx; preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (unlikely(rctx < 0)) goto fail; ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); fail: preempt_enable_notrace(); } static void perf_swevent_read(struct perf_event *event) { } static int perf_swevent_add(struct perf_event *event, int flags) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; if (is_sampling_event(event)) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); if (WARN_ON_ONCE(!head)) return -EINVAL; hlist_add_head_rcu(&event->hlist_entry, head); perf_event_update_userpage(event); return 0; } static void perf_swevent_del(struct perf_event *event, int flags) { hlist_del_rcu(&event->hlist_entry); } static void perf_swevent_start(struct perf_event *event, int flags) { event->hw.state = 0; } static void perf_swevent_stop(struct perf_event *event, int flags) { event->hw.state = PERF_HES_STOPPED; } /* Deref the hlist from the update side */ static inline struct swevent_hlist * swevent_hlist_deref(struct swevent_htable *swhash) { return rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&swhash->hlist_mutex)); } static void swevent_hlist_release(struct swevent_htable *swhash) { struct swevent_hlist *hlist = swevent_hlist_deref(swhash); if (!hlist) return; RCU_INIT_POINTER(swhash->swevent_hlist, NULL); kfree_rcu(hlist, rcu_head); } static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (!--swhash->hlist_refcount) swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); } static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) swevent_hlist_put_cpu(cpu); } static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; mutex_lock(&swhash->hlist_mutex); if (!swevent_hlist_deref(swhash) && cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); if (!hlist) { err = -ENOMEM; goto exit; } rcu_assign_pointer(swhash->swevent_hlist, hlist); } swhash->hlist_refcount++; exit: mutex_unlock(&swhash->hlist_mutex); return err; } static int swevent_hlist_get(void) { int err, cpu, failed_cpu; mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; } } mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; swevent_hlist_put_cpu(cpu); } mutex_unlock(&pmus_lock); return err; } struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { u64 event_id = event->attr.config; WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(); } static struct pmu perf_cpu_clock; /* fwd declaration */ static struct pmu perf_task_clock; static int perf_swevent_init(struct perf_event *event) { u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: event->attr.type = perf_cpu_clock.type; return -ENOENT; case PERF_COUNT_SW_TASK_CLOCK: event->attr.type = perf_task_clock.type; return -ENOENT; default: break; } if (event_id >= PERF_COUNT_SW_MAX) return -ENOENT; if (!event->parent) { int err; err = swevent_hlist_get(); if (err) return err; static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } return 0; } static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; #ifdef CONFIG_EVENT_TRACING static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_destroy(event); } static int perf_tp_event_init(struct perf_event *event) { int err; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; /* * no branch sampling for tracepoint events */ if (has_branch_stack(event)) return -EOPNOTSUPP; err = perf_trace_init(event); if (err) return err; event->destroy = tp_perf_event_destroy; return 0; } static struct pmu perf_tracepoint = { .task_ctx_nr = perf_sw_context, .event_init = perf_tp_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) event = event->parent; if (likely(!event->filter) || filter_match_preds(event->filter, record)) return 1; return 0; } static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 0; /* * If exclude_kernel, only trace user-space tracepoints (uprobes) */ if (event->attr.exclude_kernel && !user_mode(regs)) return 0; if (!perf_tp_filter_match(event, data)) return 0; return 1; } void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, rctx, task); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); static void __perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, struct perf_event *event) { struct trace_entry *entry = record; if (event->attr.config != entry->type) return; /* Cannot deliver synchronous signal to other task. */ if (event->attr.sigtrap) return; if (perf_tp_event_match(event, data, regs)) perf_swevent_event(event, count, data, regs); } static void perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, struct perf_event_context *ctx) { unsigned int cpu = smp_processor_id(); struct pmu *pmu = &perf_tracepoint; struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { __perf_tp_event_target_task(count, record, regs, data, event); for_each_sibling_event(sibling, event) __perf_tp_event_target_task(count, record, regs, data, sibling); } perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { __perf_tp_event_target_task(count, record, regs, data, event); for_each_sibling_event(sibling, event) __perf_tp_event_target_task(count, record, regs, data, sibling); } } void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; struct perf_raw_record raw = { .frag = { .size = entry_size, .data = record, }, }; perf_sample_data_init(&data, 0, 0); perf_sample_save_raw_data(&data, &raw); perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) { perf_swevent_event(event, count, &data, regs); /* * Here use the same on-stack perf_sample_data, * some members in data are event-specific and * need to be re-computed for different sweveents. * Re-initialize data->sample_flags safely to avoid * the problem that next event skips preparing data * because data->sample_flags is set. */ perf_sample_data_init(&data, 0, 0); perf_sample_save_raw_data(&data, &raw); } } /* * If we got specified a target task, also iterate its context and * deliver this event there too. */ if (task && task != current) { struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock; raw_spin_lock(&ctx->lock); perf_tp_event_target_task(count, record, regs, &data, ctx); raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); } perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe * The flags should match following PMU_FORMAT_ATTR(). * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe * * The following values specify a reference counter (or semaphore in the * terminology of tools like dtrace, systemtap, etc.) Userspace Statically * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. * * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); #endif #ifdef CONFIG_KPROBE_EVENTS static struct attribute *kprobe_attrs[] = { &format_attr_retprobe.attr, NULL, }; static struct attribute_group kprobe_format_group = { .name = "format", .attrs = kprobe_attrs, }; static const struct attribute_group *kprobe_attr_groups[] = { &kprobe_format_group, NULL, }; static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_kprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = kprobe_attr_groups, }; static int perf_kprobe_event_init(struct perf_event *event) { int err; bool is_retprobe; if (event->attr.type != perf_kprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; err = perf_kprobe_init(event, is_retprobe); if (err) return err; event->destroy = perf_kprobe_destroy; return 0; } #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); static struct attribute *uprobe_attrs[] = { &format_attr_retprobe.attr, &format_attr_ref_ctr_offset.attr, NULL, }; static struct attribute_group uprobe_format_group = { .name = "format", .attrs = uprobe_attrs, }; static const struct attribute_group *uprobe_attr_groups[] = { &uprobe_format_group, NULL, }; static int perf_uprobe_event_init(struct perf_event *event); static struct pmu perf_uprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_uprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = uprobe_attr_groups, }; static int perf_uprobe_event_init(struct perf_event *event) { int err; unsigned long ref_ctr_offset; bool is_retprobe; if (event->attr.type != perf_uprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); if (err) return err; event->destroy = perf_uprobe_destroy; return 0; } #endif /* CONFIG_UPROBE_EVENTS */ static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); #ifdef CONFIG_KPROBE_EVENTS perf_pmu_register(&perf_kprobe, "kprobe", -1); #endif #ifdef CONFIG_UPROBE_EVENTS perf_pmu_register(&perf_uprobe, "uprobe", -1); #endif } static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); } #ifdef CONFIG_BPF_SYSCALL static void bpf_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { struct bpf_perf_event_data_kern ctx = { .data = data, .event = event, }; struct bpf_prog *prog; int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); if (prog) { perf_prepare_sample(data, event, regs); ret = bpf_prog_run(prog, &ctx); } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); if (!ret) return; event->orig_overflow_handler(event, data, regs); } static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ return -EINVAL; if (event->prog) return -EEXIST; if (prog->type != BPF_PROG_TYPE_PERF_EVENT) return -EINVAL; if (event->attr.precise_ip && prog->call_get_stack && (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* * On perf_event with precise_ip, calling bpf_get_stack() * may trigger unwinder warnings and occasional crashes. * bpf_get_[stack|stackid] works around this issue by using * callchain attached to perf_sample_data. If the * perf_event does not full (kernel and user) callchain * attached to perf_sample_data, do not allow attaching BPF * program that calls bpf_get_[stack|stackid]. */ return -EPROTO; } event->prog = prog; event->bpf_cookie = bpf_cookie; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); return 0; } static void perf_event_free_bpf_handler(struct perf_event *event) { struct bpf_prog *prog = event->prog; if (!prog) return; WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); event->prog = NULL; bpf_prog_put(prog); } #else static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } static void perf_event_free_bpf_handler(struct perf_event *event) { } #endif /* * returns true if the event is a tracepoint, or a kprobe/upprobe created * with perf_event_open() */ static inline bool perf_event_is_tracing(struct perf_event *event) { if (event->pmu == &perf_tracepoint) return true; #ifdef CONFIG_KPROBE_EVENTS if (event->pmu == &perf_kprobe) return true; #endif #ifdef CONFIG_UPROBE_EVENTS if (event->pmu == &perf_uprobe) return true; #endif return false; } int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe) /* only uprobe programs are allowed to be sleepable */ return -EINVAL; /* Kprobe override only works for kprobes, not uprobes. */ if (prog->kprobe_override && !is_kprobe) return -EINVAL; if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); if (prog->aux->max_ctx_offset > off) return -EACCES; } return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } perf_event_detach_bpf_prog(event); } #else static inline void perf_tp_register(void) { } static void perf_event_free_filter(struct perf_event *event) { } int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; } void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT void perf_bp_event(struct perf_event *bp, void *data) { struct perf_sample_data sample; struct pt_regs *regs = data; perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) perf_swevent_event(bp, 1, &sample, regs); } #endif /* * Allocate a new address filter */ static struct perf_addr_filter * perf_addr_filter_new(struct perf_event *event, struct list_head *filters) { int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); struct perf_addr_filter *filter; filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); if (!filter) return NULL; INIT_LIST_HEAD(&filter->entry); list_add_tail(&filter->entry, filters); return filter; } static void free_filters_list(struct list_head *filters) { struct perf_addr_filter *filter, *iter; list_for_each_entry_safe(filter, iter, filters, entry) { path_put(&filter->path); list_del(&filter->entry); kfree(filter); } } /* * Free existing address filters and optionally install new ones */ static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head) { unsigned long flags; LIST_HEAD(list); if (!has_addr_filter(event)) return; /* don't bother with children, they don't have their own filters */ if (event->parent) return; raw_spin_lock_irqsave(&event->addr_filters.lock, flags); list_splice_init(&event->addr_filters.list, &list); if (head) list_splice(head, &event->addr_filters.list); raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); free_filters_list(&list); } /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. * Called with mm::mmap_lock down for reading. */ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct mm_struct *mm, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (perf_addr_filter_vma_adjust(filter, vma, fr)) return; } } /* * Update event's address range filters based on the * task's existing mappings, if any. */ static void perf_event_addr_filters_apply(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct task_struct *task = READ_ONCE(event->ctx->task); struct perf_addr_filter *filter; struct mm_struct *mm = NULL; unsigned int count = 0; unsigned long flags; /* * We may observe TASK_TOMBSTONE, which means that the event tear-down * will stop on the parent's child_mutex that our caller is also holding */ if (task == TASK_TOMBSTONE) return; if (ifh->nr_file_filters) { mm = get_task_mm(task); if (!mm) goto restart; mmap_read_lock(mm); } raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { /* * Adjust base offset if the filter is associated to a * binary that needs to be mapped: */ event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); } else { event->addr_filter_ranges[count].start = filter->offset; event->addr_filter_ranges[count].size = filter->size; } count++; } event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { mmap_read_unlock(mm); mmput(mm); } restart: perf_event_stop(event, 1); } /* * Address range filtering: limiting the data to certain * instruction address ranges. Filters are ioctl()ed to us from * userspace as ascii strings. * * Filter string format: * * ACTION RANGE_SPEC * where ACTION is one of the * * "filter": limit the trace to this region * * "start": start tracing from this address * * "stop": stop tracing at this address/region; * RANGE_SPEC is * * for kernel addresses: <start address>[/<size>] * * for object files: <start address>[/<size>]@</path/to/object/file> * * if <size> is not specified or is zero, the range is treated as a single * address; not valid for ACTION=="filter". */ enum { IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, IF_SRC_FILE, IF_SRC_KERNEL, IF_SRC_FILEADDR, IF_SRC_KERNELADDR, }; enum { IF_STATE_ACTION = 0, IF_STATE_SOURCE, IF_STATE_END, }; static const match_table_t if_tokens = { { IF_ACT_FILTER, "filter" }, { IF_ACT_START, "start" }, { IF_ACT_STOP, "stop" }, { IF_SRC_FILE, "%u/%u@%s" }, { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, { IF_ACT_NONE, NULL }, }; /* * Address filter string parser */ static int perf_event_parse_addr_filter(struct perf_event *event, char *fstr, struct list_head *filters) { struct perf_addr_filter *filter = NULL; char *start, *orig, *filename = NULL; substring_t args[MAX_OPT_ARGS]; int state = IF_STATE_ACTION, token; unsigned int kernel = 0; int ret = -EINVAL; orig = fstr = kstrdup(fstr, GFP_KERNEL); if (!fstr) return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { static const enum perf_addr_filter_action_t actions[] = { [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, }; ret = -EINVAL; if (!*start) continue; /* filter definition begins */ if (state == IF_STATE_ACTION) { filter = perf_addr_filter_new(event, filters); if (!filter) goto fail; } token = match_token(start, if_tokens, args); switch (token) { case IF_ACT_FILTER: case IF_ACT_START: case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; filter->action = actions[token]; state = IF_STATE_SOURCE; break; case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; fallthrough; case IF_SRC_FILEADDR: case IF_SRC_FILE: if (state != IF_STATE_SOURCE) goto fail; *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) goto fail; } if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1; kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; goto fail; } } state = IF_STATE_END; break; default: goto fail; } /* * Filter definition is fully parsed, validate and install it. * Make sure that it doesn't contradict itself or the event's * attribute. */ if (state == IF_STATE_END) { ret = -EINVAL; /* * ACTION "filter" must have a non-zero length region * specified. */ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && !filter->size) goto fail; if (!kernel) { if (!filename) goto fail; /* * For now, we only support file-based filters * in per-task events; doing so for CPU-wide * events requires additional context switching * trickery, since same object code will be * mapped at different virtual addresses in * different processes. */ ret = -EOPNOTSUPP; if (!event->ctx->task) goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) goto fail; ret = -EINVAL; if (!filter->path.dentry || !S_ISREG(d_inode(filter->path.dentry) ->i_mode)) goto fail; event->addr_filters.nr_file_filters++; } /* ready to consume more filters */ kfree(filename); filename = NULL; state = IF_STATE_ACTION; filter = NULL; kernel = 0; } } if (state != IF_STATE_ACTION) goto fail; kfree(filename); kfree(orig); return 0; fail: kfree(filename); free_filters_list(filters); kfree(orig); return ret; } static int perf_event_set_addr_filter(struct perf_event *event, char *filter_str) { LIST_HEAD(filters); int ret; /* * Since this is called in perf_ioctl() path, we're already holding * ctx::mutex. */ lockdep_assert_held(&event->ctx->mutex); if (WARN_ON_ONCE(event->parent)) return -EINVAL; ret = perf_event_parse_addr_filter(event, filter_str, &filters); if (ret) goto fail_clear_files; ret = event->pmu->addr_filters_validate(&filters); if (ret) goto fail_free_filters; /* remove existing filters, if any */ perf_addr_filters_splice(event, &filters); /* install new filters */ perf_event_for_each_child(event, perf_event_addr_filters_apply); return ret; fail_free_filters: free_filters_list(&filters); fail_clear_files: event->addr_filters.nr_file_filters = 0; return ret; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) { int ret = -EINVAL; char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); #ifdef CONFIG_EVENT_TRACING if (perf_event_is_tracing(event)) { struct perf_event_context *ctx = event->ctx; /* * Beware, here be dragons!! * * the tracepoint muck will deadlock against ctx->mutex, but * the tracepoint stuff does not actually need it. So * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we * already have a reference on ctx. * * This can result in event getting moved to a different ctx, * but that does not affect the tracepoint state. */ mutex_unlock(&ctx->mutex); ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); mutex_lock(&ctx->mutex); } else #endif if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); return ret; } /* * hrtimer based swevent callback */ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_sample_data data; struct pt_regs *regs; struct perf_event *event; u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); if (event->state != PERF_EVENT_STATE_ACTIVE) return HRTIMER_NORESTART; event->pmu->read(event); perf_sample_data_init(&data, 0, event->hw.last_period); regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) if (__perf_event_overflow(event, 1, &data, regs)) ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } static void perf_swevent_start_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 period; if (!is_sampling_event(event)) return; period = local64_read(&hwc->period_left); if (period) { if (period < 0) period = 10000; local64_set(&hwc->period_left, 0); } else { period = max_t(u64, 10000, hwc->sample_period); } hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED_HARD); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (is_sampling_event(event)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); hrtimer_cancel(&hwc->hrtimer); } } static void perf_swevent_init_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!is_sampling_event(event)) return; hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hwc->hrtimer.function = perf_swevent_hrtimer; /* * Since hrtimers have a fixed rate, we can do a static freq->period * mapping and avoid the whole period adjust feedback stuff. */ if (event->attr.freq) { long freq = event->attr.sample_freq; event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } /* * Software event: cpu wall time clock */ static void cpu_clock_event_update(struct perf_event *event) { s64 prev; u64 now; now = local_clock(); prev = local64_xchg(&event->hw.prev_count, now); local64_add(now - prev, &event->count); } static void cpu_clock_event_start(struct perf_event *event, int flags) { local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void cpu_clock_event_del(struct perf_event *event, int flags) { cpu_clock_event_stop(event, flags); } static void cpu_clock_event_read(struct perf_event *event) { cpu_clock_event_update(event); } static int cpu_clock_event_init(struct perf_event *event) { if (event->attr.type != perf_cpu_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .dev = PMU_NULL_DEV, .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, }; /* * Software event: task time clock */ static void task_clock_event_update(struct perf_event *event, u64 now) { u64 prev; s64 delta; prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; local64_add(delta, &event->count); } static void task_clock_event_start(struct perf_event *event, int flags) { local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void task_clock_event_del(struct perf_event *event, int flags) { task_clock_event_stop(event, PERF_EF_UPDATE); } static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); u64 delta = now - event->ctx->timestamp; u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } static int task_clock_event_init(struct perf_event *event) { if (event->attr.type != perf_task_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .dev = PMU_NULL_DEV, .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, }; static void perf_pmu_nop_void(struct pmu *pmu) { } static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) { } static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } static int perf_event_nop_int(struct perf_event *event, u64 value) { return 0; } static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { __this_cpu_write(nop_txn_flags, flags); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return 0; perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_enable(pmu); } static int perf_event_idx_default(struct perf_event *event) { return 0; } static void free_pmu_context(struct pmu *pmu) { free_percpu(pmu->cpu_pmu_context); } /* * Let userspace know that this PMU supports address range filtering: */ static ssize_t nr_addr_filters_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); static struct idr pmu_idr; static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); static ssize_t perf_event_mux_interval_ms_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct pmu *pmu = dev_get_drvdata(dev); int timer, cpu, ret; ret = kstrtoint(buf, 0, &timer); if (ret) return ret; if (timer < 1) return -EINVAL; /* same value, noting to do */ if (timer == pmu->hrtimer_interval_ms) return count; mutex_lock(&mux_interval_mutex); pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); } cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, NULL, }; ATTRIBUTE_GROUPS(pmu_dev); static int pmu_bus_running; static struct bus_type pmu_bus = { .name = "event_source", .dev_groups = pmu_dev_groups, }; static void pmu_dev_release(struct device *dev) { kfree(dev); } static int pmu_dev_alloc(struct pmu *pmu) { int ret = -ENOMEM; pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!pmu->dev) goto out; pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; pmu->dev->parent = pmu->parent; pmu->dev->release = pmu_dev_release; ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) goto free_dev; ret = device_add(pmu->dev); if (ret) goto free_dev; /* For PMUs with address filters, throw in an extra attribute: */ if (pmu->nr_addr_filters) ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); if (ret) goto del_dev; if (pmu->attr_update) ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); if (ret) goto del_dev; out: return ret; del_dev: device_del(pmu->dev); free_dev: put_device(pmu->dev); goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret, max = PERF_TYPE_MAX; mutex_lock(&pmus_lock); ret = -ENOMEM; pmu->pmu_disable_count = alloc_percpu(int); if (!pmu->pmu_disable_count) goto unlock; pmu->type = -1; if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { ret = -EINVAL; goto free_pdc; } pmu->name = name; if (type >= 0) max = type; ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); if (ret < 0) goto free_pdc; WARN_ON(type >= 0 && ret != type); type = ret; pmu->type = type; if (pmu_bus_running && !pmu->dev) { ret = pmu_dev_alloc(pmu); if (ret) goto free_idr; } ret = -ENOMEM; pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); if (!pmu->cpu_pmu_context) goto free_dev; for_each_possible_cpu(cpu) { struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); __perf_init_event_pmu_context(&cpc->epc, pmu); __perf_mux_hrtimer_init(cpc, cpu); } if (!pmu->start_txn) { if (pmu->pmu_enable) { /* * If we have pmu_enable/pmu_disable calls, install * transaction stubs that use that to try and batch * hardware accesses. */ pmu->start_txn = perf_pmu_start_txn; pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } } if (!pmu->pmu_enable) { pmu->pmu_enable = perf_pmu_nop_void; pmu->pmu_disable = perf_pmu_nop_void; } if (!pmu->check_period) pmu->check_period = perf_event_nop_int; if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; list_add_rcu(&pmu->entry, &pmus); atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; free_dev: if (pmu->dev && pmu->dev != PMU_NULL_DEV) { device_del(pmu->dev); put_device(pmu->dev); } free_idr: idr_remove(&pmu_idr, pmu->type); free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; } EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { mutex_lock(&pmus_lock); list_del_rcu(&pmu->entry); /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); free_percpu(pmu->pmu_disable_count); idr_remove(&pmu_idr, pmu->type); if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); static inline bool has_extended_regs(struct perf_event *event) { return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); } static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; int ret; if (!try_module_get(pmu->module)) return -ENODEV; /* * A number of pmu->event_init() methods iterate the sibling_list to, * for example, validate if the group fits on the PMU. Therefore, * if this is a sibling event, acquire the ctx->mutex to protect * the sibling_list. */ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. */ ctx = perf_event_ctx_lock_nested(event->group_leader, SINGLE_DEPTH_NESTING); BUG_ON(!ctx); } event->pmu = pmu; ret = pmu->event_init(event); if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); if (!ret) { if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && has_extended_regs(event)) ret = -EOPNOTSUPP; if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && event_has_any_exclude_flag(event)) ret = -EINVAL; if (ret && event->destroy) event->destroy(event); } if (ret) module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; int idx, type, ret; struct pmu *pmu; idx = srcu_read_lock(&pmus_srcu); /* * Save original type before calling pmu->event_init() since certain * pmus overwrites event->attr.type to forward event to another pmu. */ event->orig_type = event->attr.type; /* Try parent's PMU first: */ if (event->parent && event->parent->pmu) { pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; } /* * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE * are often aliases for PERF_TYPE_RAW. */ type = event->attr.type; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { type = event->attr.config >> PERF_PMU_TYPE_SHIFT; if (!type) { type = PERF_TYPE_RAW; } else { extended_type = true; event->attr.config &= PERF_HW_EVENT_MASK; } } again: rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) goto fail; ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { type = event->attr.type; goto again; } if (ret) pmu = ERR_PTR(ret); goto unlock; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; if (ret != -ENOENT) { pmu = ERR_PTR(ret); goto unlock; } } fail: pmu = ERR_PTR(-ENOENT); unlock: srcu_read_unlock(&pmus_srcu, idx); return pmu; } static void attach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_add_rcu(&event->sb_list, &pel->list); raw_spin_unlock(&pel->lock); } /* * We keep a list of all !task (and therefore per-cpu) events * that need to receive side-band records. * * This avoids having to scan all the various PMU per-cpu contexts * looking for them. */ static void account_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) attach_sb_event(event); } /* Freq events need the tick to stay alive (see perf_event_task_tick). */ static void account_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL /* Lock so we don't race with concurrent unaccount */ spin_lock(&nr_freq_lock); if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void account_freq_event(void) { if (tick_nohz_full_enabled()) account_freq_event_nohz(); else atomic_inc(&nr_freq_events); } static void account_event(struct perf_event *event) { bool inc = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.build_id) atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) atomic_inc(&nr_namespaces_events); if (event->attr.cgroup) atomic_inc(&nr_cgroup_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); inc = true; } if (has_branch_stack(event)) inc = true; if (is_cgroup_event(event)) inc = true; if (event->attr.ksymbol) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); if (event->attr.text_poke) atomic_inc(&nr_text_poke_events); if (inc) { /* * We need the mutex here because static_branch_enable() * must complete *before* the perf_sched_count increment * becomes visible. */ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; mutex_lock(&perf_sched_mutex); if (!atomic_read(&perf_sched_count)) { static_branch_enable(&perf_sched_events); /* * Guarantee that all CPUs observe they key change and * call the perf scheduling hooks before proceeding to * install events that need them. */ synchronize_rcu(); } /* * Now that we have waited for the sync_sched(), allow further * increments to by-pass the mutex. */ atomic_inc(&perf_sched_count); mutex_unlock(&perf_sched_mutex); } enabled: account_pmu_sb_event(event); } /* * Allocate and initialize an event structure */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, struct task_struct *task, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, void *context, int cgroup_fd) { struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; int node; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL); } if (attr->sigtrap && !task) { /* Requires a task: avoid signalling random tasks. */ return ERR_PTR(-EINVAL); } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); /* * Single events are their own group leaders, with an * empty sibling list: */ if (!group_leader) group_leader = event; mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->active_list); init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; event->pmu = NULL; event->oncpu = -1; event->parent = parent_event; event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; if (parent_event) event->event_caps = parent_event->event_caps; if (task) { event->attach_state = PERF_ATTACH_TASK; /* * XXX pmu::event_init needs to know what task to account to * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ event->hw.target = get_task_struct(task); } event->clock = &local_clock; if (parent_event) event->clock = parent_event->clock; if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (overflow_handler == bpf_overflow_handler) { struct bpf_prog *prog = parent_event->prog; bpf_prog_inc(prog); event->prog = prog; event->orig_overflow_handler = parent_event->orig_overflow_handler; } #endif } if (overflow_handler) { event->overflow_handler = overflow_handler; event->overflow_handler_context = context; } else if (is_write_backward(event)){ event->overflow_handler = perf_event_output_backward; event->overflow_handler_context = NULL; } else { event->overflow_handler = perf_event_output_forward; event->overflow_handler_context = NULL; } perf_event__state_init(event); pmu = NULL; hwc = &event->hw; hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); /* * We currently do not support PERF_SAMPLE_READ on inherited events. * See perf_output_read(). */ if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) goto err_ns; if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); if (IS_ERR(pmu)) { err = PTR_ERR(pmu); goto err_ns; } /* * Disallow uncore-task events. Similarly, disallow uncore-cgroup * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). */ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { err = -EINVAL; goto err_pmu; } if (event->attr.aux_output && !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { err = -EOPNOTSUPP; goto err_pmu; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) goto err_pmu; } err = exclusive_event_init(event); if (err) goto err_pmu; if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); if (!event->addr_filter_ranges) { err = -ENOMEM; goto err_per_task; } /* * Clone the parent's vma offsets: they are valid until exec() * even if the mm is not shared with the parent. */ if (event->parent) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); raw_spin_lock_irq(&ifh->lock); memcpy(event->addr_filter_ranges, event->parent->addr_filter_ranges, pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); raw_spin_unlock_irq(&ifh->lock); } /* force hw sync on the address filters */ event->addr_filters_gen = 1; } if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) goto err_addr_filters; } } err = security_perf_event_alloc(event); if (err) goto err_callchain_buffer; /* symmetric to unaccount_event() in _free_event() */ account_event(event); return event; err_callchain_buffer: if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); } err_addr_filters: kfree(event->addr_filter_ranges); err_per_task: exclusive_event_destroy(event); err_pmu: if (is_cgroup_event(event)) perf_detach_cgroup(event); if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: if (event->hw.target) put_task_struct(event->hw.target); call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr) { u32 size; int ret; /* Zero the full structure, so that a short copy will be nice. */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; /* ABI compatibility quirk: */ if (!size) size = PERF_ATTR_SIZE_VER0; if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); if (ret) { if (ret == -E2BIG) goto err_size; return ret; } attr->size = size; if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) return -EINVAL; if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { u64 mask = attr->branch_sample_type; /* only using defined bits */ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) return -EINVAL; /* at least one branch bit must be set */ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL; /* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { /* exclude_kernel checked on syscall entry */ if (!attr->exclude_kernel) mask |= PERF_SAMPLE_BRANCH_KERNEL; if (!attr->exclude_user) mask |= PERF_SAMPLE_BRANCH_USER; if (!attr->exclude_hv) mask |= PERF_SAMPLE_BRANCH_HV; /* * adjust user setting (for HW filter setup) */ attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { ret = perf_allow_kernel(attr); if (ret) return ret; } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { ret = perf_reg_validate(attr->sample_regs_user); if (ret) return ret; } if (attr->sample_type & PERF_SAMPLE_STACK_USER) { if (!arch_perf_have_user_stack_dump()) return -ENOSYS; /* * We have __u32 type for the size, but so far * we can only use __u16 as maximum due to the * __u16 sample size limit. */ if (attr->sample_stack_user >= USHRT_MAX) return -EINVAL; else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) return -EINVAL; } if (!attr->sample_max_stack) attr->sample_max_stack = sysctl_perf_event_max_stack; if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); #ifndef CONFIG_CGROUP_PERF if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) return -EINVAL; if (!attr->inherit && attr->inherit_thread) return -EINVAL; if (attr->remove_on_exec && attr->enable_on_exec) return -EINVAL; if (attr->sigtrap && !attr->remove_on_exec) return -EINVAL; out: return ret; err_size: put_user(sizeof(*attr), &uattr->size); ret = -E2BIG; goto out; } static void mutex_lock_double(struct mutex *a, struct mutex *b) { if (b < a) swap(a, b); mutex_lock(a); mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) { mutex_lock(&event->mmap_mutex); goto set; } /* don't allow circular references */ if (event == output_event) goto out; /* * Don't allow cross-cpu buffers */ if (output_event->cpu != event->cpu) goto out; /* * If its not a per-cpu rb, it must be the same task. */ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) goto out; /* * Mixing clocks in the same buffer is trouble you don't need. */ if (output_event->clock != event->clock) goto out; /* * Either writing ring buffer from beginning or from end. * Mixing is not allowed. */ if (is_write_backward(output_event) != is_write_backward(event)) goto out; /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && event->pmu != output_event->pmu) goto out; /* * Hold both mmap_mutex to serialize against perf_mmap_close(). Since * output_event is already on rb->event_list, and the list iteration * restarts after every removal, it is guaranteed this new event is * observed *OR* if output_event is already removed, it's guaranteed we * observe !rb->mmap_count. */ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock; if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) goto unlock; /* did we race against perf_mmap_close() */ if (!atomic_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } } ring_buffer_attach(event, rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); if (output_event) mutex_unlock(&output_event->mmap_mutex); out: return ret; } static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; switch (clk_id) { case CLOCK_MONOTONIC: event->clock = &ktime_get_mono_fast_ns; nmi_safe = true; break; case CLOCK_MONOTONIC_RAW: event->clock = &ktime_get_raw_fast_ns; nmi_safe = true; break; case CLOCK_REALTIME: event->clock = &ktime_get_real_ns; break; case CLOCK_BOOTTIME: event->clock = &ktime_get_boottime_ns; break; case CLOCK_TAI: event->clock = &ktime_get_clocktai_ns; break; default: return -EINVAL; } if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) return -EINVAL; return 0; } static bool perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) { unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; bool is_capable = perfmon_capable(); if (attr->sigtrap) { /* * perf_event_attr::sigtrap sends signals to the other task. * Require the current task to also have CAP_KILL. */ rcu_read_lock(); is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); rcu_read_unlock(); /* * If the required capabilities aren't available, checks for * ptrace permissions: upgrade to ATTACH, since sending signals * can effectively change the target task. */ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; } /* * Preserve ptrace permission check for backwards compatibility. The * ptrace check also includes checks that the current task and other * task have matching uids, and is therefore not done here explicitly. */ return is_capable || ptrace_may_access(task, ptrace_mode); } /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * * @attr_uptr: event_id type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader event fd * @flags: perf event open flags */ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int move_group = 0; int err; int f_flags = O_RDWR; int cgroup_fd = -1; /* for future expandability... */ if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); if (err) return err; /* Do we allow access to perf_event_open(2) ? */ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { err = perf_allow_kernel(&attr); if (err) return err; } if (attr.namespaces) { if (!perfmon_capable()) return -EACCES; } if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; } else { if (attr.sample_period & (1ULL << 63)) return -EINVAL; } /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { err = perf_allow_kernel(&attr); if (err) return err; } /* REGS_INTR can leak data, lockdown must prevent this */ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { err = security_locked_down(LOCKDOWN_PERF); if (err) return err; } /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument * designates the cpu on which to monitor threads from that * cgroup. */ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; if (flags & PERF_FLAG_FD_CLOEXEC) f_flags |= O_CLOEXEC; event_fd = get_unused_fd_flags(f_flags); if (event_fd < 0) return event_fd; if (group_fd != -1) { err = perf_fget_light(group_fd, &group); if (err) goto err_fd; group_leader = group.file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) group_leader = NULL; } if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); goto err_group_fd; } } if (task && group_leader && group_leader->attr.inherit != attr.inherit) { err = -EINVAL; goto err_task; } if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_task; } if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -EOPNOTSUPP; goto err_alloc; } } /* * Special case software events and allow them to be part of * any hardware group. */ pmu = event->pmu; if (attr.use_clockid) { err = perf_event_set_clock(event, attr.clockid); if (err) goto err_alloc; } if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; if (task) { err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_alloc; /* * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). */ err = -EACCES; if (!perf_check_permission(&attr, task)) goto err_cred; } /* * Get the target context (task or percpu): */ ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_cred; } mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_locked; } if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). */ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); if (!cpuctx->online) { err = -ENODEV; goto err_locked; } } if (group_leader) { err = -EINVAL; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) goto err_locked; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) goto err_locked; /* * Make sure we're both events for the same CPU; * grouping events for different CPUs is broken; since * you can never concurrently schedule them anyhow. */ if (group_leader->cpu != event->cpu) goto err_locked; /* * Make sure we're both on the same context; either task or cpu. */ if (group_leader->ctx != ctx) goto err_locked; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) goto err_locked; if (is_software_event(event) && !in_software_context(group_leader)) { /* * If the event is a sw event, but the group_leader * is on hw context. * * Allow the addition of software events to hw * groups, this is safe because software events * never fail to schedule. * * Note the comment that goes with struct * perf_event_pmu_context. */ pmu = group_leader->pmu_ctx->pmu; } else if (!is_software_event(event)) { if (is_software_event(group_leader) && (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to * the hardware context. */ move_group = 1; } /* Don't allow group of multiple hw events from different pmus */ if (!in_software_context(group_leader) && group_leader->pmu_ctx->pmu != pmu) goto err_locked; } } /* * Now that we're certain of the pmu; find the pmu_ctx. */ pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); goto err_locked; } event->pmu_ctx = pmu_ctx; if (output_event) { err = perf_event_set_output(event, output_event); if (err) goto err_context; } if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_context; } if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; goto err_context; } /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_context; } WARN_ON_ONCE(ctx->parent_ctx); event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); event_file = NULL; goto err_context; } /* * This is the point on no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { perf_remove_from_context(group_leader, 0); put_pmu_ctx(group_leader->pmu_ctx); for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); put_pmu_ctx(sibling->pmu_ctx); } /* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group * (through the sibling list, which is still in-tact), we can * end up with siblings installed in the wrong context. * * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ for_each_sibling_event(sibling, group_leader) { sibling->pmu_ctx = pmu_ctx; get_pmu_ctx(pmu_ctx); perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); } /* * Removing from the context ends up with disabled * event. What we want here is event in the initial * startup state, ready to be add into new context. */ group_leader->pmu_ctx = pmu_ctx; get_pmu_ctx(pmu_ctx); perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); } /* * Precalculate sample_data sizes; do while holding ctx::mutex such * that we're serialized against further additions and before * perf_install_in_context() which is the point the event is active and * can use these values. */ perf_event__header_size(event); perf_event__id_header_size(event); event->owner = current; perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); if (task) { up_read(&task->signal->exec_update_lock); put_task_struct(task); } mutex_lock(&current->perf_event_mutex); list_add_tail(&event->owner_entry, &current->perf_event_list); mutex_unlock(&current->perf_event_mutex); /* * Drop the reference on the group_event after placing the * new event on the sibling_list. This ensures destruction * of the group leader will find the pointer to itself in * perf_group_detach(). */ fdput(group); fd_install(event_fd, event_file); return event_fd; err_context: put_pmu_ctx(event->pmu_ctx); event->pmu_ctx = NULL; /* _free_event() */ err_locked: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_cred: if (task) up_read(&task->signal->exec_update_lock); err_alloc: free_event(event); err_task: if (task) put_task_struct(task); err_group_fd: fdput(group); err_fd: put_unused_fd(event_fd); return err; } /** * perf_event_create_kernel_counter * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound * @task: task to profile (NULL for percpu) * @overflow_handler: callback to trigger when we hit the event * @context: context data could be used in overflow_handler callback */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t overflow_handler, void *context) { struct perf_event_pmu_context *pmu_ctx; struct perf_event_context *ctx; struct perf_event *event; struct pmu *pmu; int err; /* * Grouping is not supported for kernel events, neither is 'AUX', * make sure the caller's intentions are adjusted. */ if (attr->aux_output) return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; } /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; pmu = event->pmu; if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; /* * Get the target context (task or percpu): */ ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; } WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_unlock; } pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); goto err_unlock; } event->pmu_ctx = pmu_ctx; if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). */ struct perf_cpu_context *cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; goto err_pmu_ctx; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_pmu_ctx; } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; err_pmu_ctx: put_pmu_ctx(pmu_ctx); event->pmu_ctx = NULL; /* _free_event() */ err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_alloc: free_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); static void __perf_pmu_remove(struct perf_event_context *ctx, int cpu, struct pmu *pmu, struct perf_event_groups *groups, struct list_head *events) { struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); put_pmu_ctx(event->pmu_ctx); list_add(&event->migrate_entry, events); for_each_sibling_event(sibling, event) { perf_remove_from_context(sibling, 0); put_pmu_ctx(sibling->pmu_ctx); list_add(&sibling->migrate_entry, events); } } } static void __perf_pmu_install_event(struct pmu *pmu, struct perf_event_context *ctx, int cpu, struct perf_event *event) { struct perf_event_pmu_context *epc; event->cpu = cpu; epc = find_get_pmu_context(pmu, ctx, event); event->pmu_ctx = epc; if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; perf_install_in_context(ctx, event, cpu); } static void __perf_pmu_install(struct perf_event_context *ctx, int cpu, struct pmu *pmu, struct list_head *events) { struct perf_event *event, *tmp; /* * Re-instate events in 2 passes. * * Skip over group leaders and only install siblings on this first * pass, siblings will not get enabled without a leader, however a * leader will enable its siblings, even if those are still on the old * context. */ list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); __perf_pmu_install_event(pmu, ctx, cpu, event); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. */ list_for_each_entry_safe(event, tmp, events, migrate_entry) { list_del(&event->migrate_entry); __perf_pmu_install_event(pmu, ctx, cpu, event); } } void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) { struct perf_event_context *src_ctx, *dst_ctx; LIST_HEAD(events); src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); if (!list_empty(&events)) { /* * Wait for the events to quiesce before re-instating them. */ synchronize_rcu(); __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); } mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); static void sync_child_event(struct perf_event *child_event) { struct perf_event *parent_event = child_event->parent; u64 child_val; if (child_event->attr.inherit_stat) { struct task_struct *task = child_event->ctx->task; if (task && task != TASK_TOMBSTONE) perf_event_read_event(child_event, task); } child_val = perf_event_count(child_event); /* * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); } static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *parent_event = event->parent; unsigned long detach_flags = 0; if (parent_event) { /* * Do not destroy the 'original' grouping; because of the * context switch optimization the original events could've * ended up in a random child task. * * If we were to destroy the original group, all group related * operations would cease to function properly after this * random child dies. * * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ detach_flags = DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); } perf_remove_from_context(event, detach_flags); raw_spin_lock_irq(&ctx->lock); if (event->state > PERF_EVENT_STATE_EXIT) perf_event_set_state(event, PERF_EVENT_STATE_EXIT); raw_spin_unlock_irq(&ctx->lock); /* * Child events can be freed. */ if (parent_event) { mutex_unlock(&parent_event->child_mutex); /* * Kick perf_poll() for is_event_hup(); */ perf_event_wakeup(parent_event); free_event(event); put_event(parent_event); return; } /* * Parent events are governed by their filedesc, retain them. */ perf_event_wakeup(event); } static void perf_event_exit_task_context(struct task_struct *child) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; WARN_ON_ONCE(child != current); child_ctx = perf_pin_task_context(child); if (!child_ctx) return; /* * In order to reduce the amount of tricky in ctx tear-down, we hold * ctx::mutex over the entire thing. This serializes against almost * everything that wants to access the ctx. * * The exception is sys_perf_event_open() / * perf_event_create_kernel_count() which does find_get_context() * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). */ mutex_lock(&child_ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ raw_spin_lock_irq(&child_ctx->lock); task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ RCU_INIT_POINTER(child->perf_event_ctxp, NULL); put_ctx(child_ctx); /* cannot be last */ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ clone_ctx = unclone_ctx(child_ctx); raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); /* * Report the task dead after unscheduling the events so that we * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. */ perf_event_task(child, child_ctx, 0); list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) perf_event_exit_event(child_event, child_ctx); mutex_unlock(&child_ctx->mutex); put_ctx(child_ctx); } /* * When a child task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) { struct perf_event *event, *tmp; mutex_lock(&child->perf_event_mutex); list_for_each_entry_safe(event, tmp, &child->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); /* * Ensure the list deletion is visible before we clear * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); perf_event_exit_task_context(child); /* * The perf_event_exit_task_context calls perf_event_task * with child's task_ctx, which generates EXIT events for * child contexts and sets child->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ perf_event_task(child, NULL, 0); } static void perf_free_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *parent = event->parent; if (WARN_ON_ONCE(!parent)) return; mutex_lock(&parent->child_mutex); list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); put_event(parent); raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); free_event(event); } /* * Free a context as created by inheritance by perf_event_init_task() below, * used by fork() in case of fail. * * Even though the task has never lived, the context and events have been * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { struct perf_event_context *ctx; struct perf_event *event, *tmp; ctx = rcu_access_pointer(task->perf_event_ctxp); if (!ctx) return; mutex_lock(&ctx->mutex); raw_spin_lock_irq(&ctx->lock); /* * Destroy the task <-> ctx relation and mark the context dead. * * This is important because even though the task hasn't been * exposed yet the context has been (through child_list). */ RCU_INIT_POINTER(task->perf_event_ctxp, NULL); WRITE_ONCE(ctx->task, TASK_TOMBSTONE); put_task_struct(task); /* cannot be last */ raw_spin_unlock_irq(&ctx->lock); list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) perf_free_event(event, ctx); mutex_unlock(&ctx->mutex); /* * perf_event_release_kernel() could've stolen some of our * child events and still have them on its free_list. In that * case we must wait for these events to have been freed (in * particular all their references to this task must've been * dropped). * * Without this copy_process() will unconditionally free this * task (irrespective of its reference count) and * _free_event()'s put_task_struct(event->hw.target) will be a * use-after-free. * * Wait for all events to drop their context reference. */ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); put_ctx(ctx); /* must be last */ } void perf_event_delayed_put(struct task_struct *task) { WARN_ON_ONCE(task->perf_event_ctxp); } struct file *perf_event_get(unsigned int fd) { struct file *file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &perf_fops) { fput(file); return ERR_PTR(-EBADF); } return file; } const struct perf_event *perf_get_event(struct file *file) { if (file->f_op != &perf_fops) return ERR_PTR(-EINVAL); return file->private_data; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) return ERR_PTR(-EINVAL); return &event->attr; } /* * Inherit an event from parent task to child task. * * Returns: * - valid pointer on success * - NULL for orphaned events * - IS_ERR() on error */ static struct perf_event * inherit_event(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event *group_leader, struct perf_event_context *child_ctx) { enum perf_event_state parent_state = parent_event->state; struct perf_event_pmu_context *pmu_ctx; struct perf_event *child_event; unsigned long flags; /* * Instead of creating recursive hierarchies of events, * we link inherited events back to the original parent, * which has a filp for sure, which we use as the reference * count: */ if (parent_event->parent) parent_event = parent_event->parent; child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, group_leader, parent_event, NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against * perf_event_release_kernel(), such that either we must observe * is_orphaned_event() or they will observe us on the child_list. */ mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } get_ctx(child_ctx); /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_event_{en, dis}able_family. */ if (parent_state >= PERF_EVENT_STATE_INACTIVE) child_event->state = PERF_EVENT_STATE_INACTIVE; else child_event->state = PERF_EVENT_STATE_OFF; if (parent_event->attr.freq) { u64 sample_period = parent_event->hw.sample_period; struct hw_perf_event *hwc = &child_event->hw; hwc->sample_period = sample_period; hwc->last_period = sample_period; local64_set(&hwc->period_left, sample_period); } child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; /* * Precalculate sample_data sizes */ perf_event__header_size(child_event); perf_event__id_header_size(child_event); /* * Link it up in the child's context: */ raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); child_event->attach_state |= PERF_ATTACH_CHILD; raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Link this into the parent event's child list */ list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); return child_event; } /* * Inherits an event group. * * This will quietly suppress orphaned events; !inherit_event() is not an error. * This matches with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event_context *child_ctx) { struct perf_event *leader; struct perf_event *sub; struct perf_event *child_ctr; leader = inherit_event(parent_event, parent, parent_ctx, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); /* * @leader can be NULL here because of is_orphaned_event(). In this * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); if (sub->aux_event == parent_event && child_ctr && !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } if (leader) leader->group_generation = parent_event->group_generation; return 0; } /* * Creates the child task context and tries to inherit the event-group. * * Clears @inherited_all on !attr.inherited or error. Note that we'll leave * inherited_all set when we 'fail' to inherit an orphaned event; this is * consistent with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, u64 clone_flags, int *inherited_all) { struct perf_event_context *child_ctx; int ret; if (!event->attr.inherit || (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || /* Do not inherit if sigtrap and signal handlers were cleared. */ (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { *inherited_all = 0; return 0; } child_ctx = child->perf_event_ctxp; if (!child_ctx) { /* * This is executed from the parent task context, so * inherit events that have been marked for cloning. * First allocate and initialize a context for the * child. */ child_ctx = alloc_perf_context(child); if (!child_ctx) return -ENOMEM; child->perf_event_ctxp = child_ctx; } ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; return ret; } /* * Initialize the perf_event context in task_struct */ static int perf_event_init_context(struct task_struct *child, u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; unsigned long flags; int ret = 0; if (likely(!parent->perf_event_ctxp)) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0; /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL * is if we exit, and since we're currently in the middle of * a fork we can't be exiting at the same time. */ /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. */ mutex_lock(&parent_ctx->mutex); /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, clone_flags, &inherited_all); if (ret) goto out_unlock; } /* * We can't hold ctx->lock when iterating the ->flexible_group list due * to allocations, but we need to prevent rotation because * rotate_ctx() will change the list from interrupt context. */ raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, clone_flags, &inherited_all); if (ret) goto out_unlock; } raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. * * Note that if the parent is a clone, the holding of * parent_ctx->lock avoids it from being uncloned. */ cloned_ctx = parent_ctx->parent_ctx; if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; child_ctx->parent_gen = parent_ctx->parent_gen; } else { child_ctx->parent_ctx = parent_ctx; child_ctx->parent_gen = parent_ctx->generation; } get_ctx(child_ctx->parent_ctx); } raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); out_unlock: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); put_ctx(parent_ctx); return ret; } /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ret; child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); ret = perf_event_init_context(child, clone_flags); if (ret) { perf_event_free_task(child); return ret; } return 0; } static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; struct perf_cpu_context *cpuctx; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); cpuctx->heap = cpuctx->heap_default; } } static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); WARN_ON(!hlist); rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx = __info; struct perf_event *event; raw_spin_lock(&ctx->lock); ctx_sched_out(ctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; // XXX simplify cpuctx->online mutex_lock(&pmus_lock); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } #else static void perf_event_exit_cpu_context(int cpu) { } #endif int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); cpumask_set_cpu(cpu, perf_online_mask); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); cpuctx->online = 1; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); return 0; } int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); return 0; } static int perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) { int cpu; for_each_online_cpu(cpu) perf_event_exit_cpu(cpu); return NOTIFY_OK; } /* * Run the perf reboot notifier at the very last possible moment so that * the generic watchdog code runs as long as possible. */ static struct notifier_block perf_reboot_notifier = { .notifier_call = perf_reboot, .priority = INT_MIN, }; void __init perf_event_init(void) { int ret; idr_init(&pmu_idr); perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1); perf_pmu_register(&perf_task_clock, "task_clock", -1); perf_tp_register(); perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC); /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. */ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) != 1024); } ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return 0; } EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { struct pmu *pmu; int ret; mutex_lock(&pmus_lock); ret = bus_register(&pmu_bus); if (ret) goto unlock; list_for_each_entry(pmu, &pmus, entry) { if (pmu->dev) continue; ret = pmu_dev_alloc(pmu); WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); } pmu_bus_running = 1; ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; } device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF static struct cgroup_subsys_state * perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; jc = kzalloc(sizeof(*jc), GFP_KERNEL); if (!jc) return ERR_PTR(-ENOMEM); jc->info = alloc_percpu(struct perf_cgroup_info); if (!jc->info) { kfree(jc); return ERR_PTR(-ENOMEM); } return &jc->css; } static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); free_percpu(jc->info); kfree(jc); } static int perf_cgroup_css_online(struct cgroup_subsys_state *css) { perf_event_cgroup(css->cgroup); return 0; } static int __perf_cgroup_move(void *info) { struct task_struct *task = info; preempt_disable(); perf_cgroup_switch(task); preempt_enable(); return 0; } static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .css_online = perf_cgroup_css_online, .attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can * always be filtered by cgroup2 path as long as perf_event * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
387 387 387 387 387 481 481 481 481 481 166 370 481 481 481 481 481 481 481 481 481 481 481 481 481 481 481 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 // SPDX-License-Identifier: GPL-2.0 /* * Interface between ext4 and JBD */ #include "ext4_jbd2.h" #include <trace/events/ext4.h> int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && !test_opt(inode->i_sb, DELALLOC))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ } if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ BUG(); } /* Just increment the non-pointer handle value */ static handle_t *ext4_get_nojournal(void) { handle_t *handle = current->journal_info; unsigned long ref_cnt = (unsigned long)handle; BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); ref_cnt++; handle = (handle_t *)ref_cnt; current->journal_info = handle; return handle; } /* Decrement the non-pointer handle value */ static void ext4_put_nojournal(handle_t *handle) { unsigned long ref_cnt = (unsigned long)handle; BUG_ON(ref_cnt == 0); ref_cnt--; handle = (handle_t *)ref_cnt; current->journal_info = handle; } /* * Wrappers for jbd2_journal_start/end. */ static int ext4_journal_check_start(struct super_block *sb) { journal_t *journal; might_sleep(); if (unlikely(ext4_forced_shutdown(sb))) return -EIO; if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* * Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ if (journal && is_journal_aborted(journal)) { ext4_abort(sb, -journal->j_errno, "Detected aborted journal"); return -EROFS; } return 0; } handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { journal_t *journal; int err; if (inode) trace_ext4_journal_start_inode(inode, blocks, rsv_blocks, revoke_creds, type, _RET_IP_); else trace_ext4_journal_start_sb(sb, blocks, rsv_blocks, revoke_creds, type, _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); journal = EXT4_SB(sb)->s_journal; if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return ext4_get_nojournal(); return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, GFP_NOFS, type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) { struct super_block *sb; int err; int rc; if (!ext4_handle_valid(handle)) { ext4_put_nojournal(handle); return 0; } err = handle->h_err; if (!handle->h_transaction) { rc = jbd2_journal_stop(handle); return err ? err : rc; } sb = handle->h_transaction->t_journal->j_private; rc = jbd2_journal_stop(handle); if (!err) err = rc; if (err) __ext4_std_error(sb, where, line, err); return err; } handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type) { struct super_block *sb; int err; if (!ext4_handle_valid(handle)) return ext4_get_nojournal(); sb = handle->h_journal->j_private; trace_ext4_journal_start_reserved(sb, jbd2_handle_buffer_credits(handle), _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) { jbd2_journal_free_reserved(handle); return ERR_PTR(err); } err = jbd2_journal_start_reserved(handle, type, line); if (err < 0) return ERR_PTR(err); return handle; } int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, int extend_cred, int revoke_cred) { if (!ext4_handle_valid(handle)) return 0; if (is_handle_aborted(handle)) return -EROFS; if (jbd2_handle_buffer_credits(handle) >= check_cred && handle->h_revoke_credits >= revoke_cred) return 0; extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle)); revoke_cred = max(0, revoke_cred - handle->h_revoke_credits); return ext4_journal_extend(handle, extend_cred, revoke_cred); } static void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err) { char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); BUG_ON(!ext4_handle_valid(handle)); if (bh) BUFFER_TRACE(bh, "abort"); if (!handle->h_err) handle->h_err = err; if (is_handle_aborted(handle)) return; printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); } static void ext4_check_bdev_write_error(struct super_block *sb) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; /* * If the block device has write error flag, it may have failed to * async write out metadata buffers in the background. In this case, * we could read old data from disk and write it out again, which * may lead to on-disk filesystem inconsistency. */ if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) { spin_lock(&sbi->s_bdev_wb_lock); err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err); spin_unlock(&sbi->s_bdev_wb_lock); if (err) ext4_error_err(sb, -err, "Error while async write back metadata"); } } int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type) { int err; might_sleep(); ext4_check_bdev_write_error(sb); if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } } if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); return 0; } /* * The ext4 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be * revoked in all cases. * * "bh" may be NULL: a metadata block may have been freed from memory * but there may still be a record of it in the journal, and that record * still needs to be revoked. */ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; might_sleep(); trace_ext4_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n", bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); /* In the no journal case, we can just do a bforget and return */ if (!ext4_handle_valid(handle)) { bforget(bh); return 0; } /* Never use the revoke function if we are doing full data * journaling: there is no need to, and a V1 superblock won't * support it. Otherwise, only skip the revoke on un-journaled * data blocks. */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (!is_metadata && !ext4_should_journal_data(inode))) { if (bh) { BUFFER_TRACE(bh, "call jbd2_journal_forget"); err = jbd2_journal_forget(handle, bh); if (err) ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } return 0; } /* * data!=journal && (is_metadata || should_journal_data(inode)) */ BUFFER_TRACE(bh, "call jbd2_journal_revoke"); err = jbd2_journal_revoke(handle, blocknr, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); __ext4_error(inode->i_sb, where, line, true, -err, 0, "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); return err; } int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type) { int err; if (!ext4_handle_valid(handle)) return 0; err = jbd2_journal_get_create_access(handle, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); return 0; } int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh) { int err = 0; might_sleep(); set_buffer_meta(bh); set_buffer_prio(bh); set_buffer_uptodate(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); /* Errors can only happen due to aborted journal or a nasty bug */ if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); if (inode == NULL) { pr_err("EXT4: jbd2_journal_dirty_metadata " "failed: handle type %u started at " "line %u, credits %u/%u, errcode %d", handle->h_type, handle->h_line_no, handle->h_requested_credits, jbd2_handle_buffer_credits(handle), err); return err; } ext4_error_inode(inode, where, line, bh->b_blocknr, "journal_dirty_metadata failed: " "handle type %u started at line %u, " "credits %u/%u, errcode %d", handle->h_type, handle->h_line_no, handle->h_requested_credits, jbd2_handle_buffer_credits(handle), err); } } else { if (inode) mark_buffer_dirty_inode(bh, inode); else mark_buffer_dirty(bh); if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { ext4_error_inode_err(inode, where, line, bh->b_blocknr, EIO, "IO error syncing itable block"); err = -EIO; } } } return err; }
64 370 370 481 481 370 370 370 370 370 21 21 21 21 21 370 370 370 23 23 23 24 24 24 24 24 21 21 1 1 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 20 21 20 20 20 21 20 20 20 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/super.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> #include <linux/random.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/unicode.h> #include <linux/part_stat.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/fsnotify.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "mballoc.h" #include "fsmap.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> static struct ext4_lazy_init *ext4_li_info; static DEFINE_MUTEX(ext4_li_mtx); static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); static void ext4_update_super(struct super_block *sb); static int ext4_commit_super(struct super_block *sb); static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum); static int ext4_validate_options(struct fs_context *fc); static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb); static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* * Lock ordering * * page fault path: * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> page lock -> * i_data_sem (rw) * * truncate: * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> * page lock * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) * * writepages: * transaction start -> page lock(s) -> i_data_sem (rw) */ static const struct fs_context_operations ext4_context_ops = { .parse_param = ext4_parse_param, .get_tree = ext4_get_tree, .reconfigure = ext4_reconfigure, .free = ext4_fc_free, }; #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we * recheck the buffer contents. */ clear_buffer_verified(bh); bh->b_end_io = end_io ? end_io : end_buffer_read_sync; get_bh(bh); submit_bh(REQ_OP_READ | op_flags, bh); } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return; } __ext4_read_bh(bh, op_flags, end_io); } int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return 0; } __ext4_read_bh(bh, op_flags, end_io); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; return -EIO; } int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { ext4_read_bh_nowait(bh, op_flags, NULL); return 0; } return ext4_read_bh(bh, op_flags, NULL); } /* * This works like __bread_gfp() except it uses ERR_PTR for error * returns. Currently with sb_bread it's impossible to distinguish * between ENOMEM and EIO situations (since both result in a NULL * return. */ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, sector_t block, blk_opf_t op_flags, gfp_t gfp) { struct buffer_head *bh; int ret; bh = sb_getblk_gfp(sb, block, gfp); if (bh == NULL) return ERR_PTR(-ENOMEM); if (ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, ~__GFP_FS) | __GFP_MOVABLE; return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, ~__GFP_FS); return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); if (likely(bh)) { if (trylock_buffer(bh)) ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); brelse(bh); } } static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } __le32 ext4_superblock_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; csum = ext4_chksum(sbi, ~0, (char *)es, offset); return cpu_to_le32(csum); } static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(sb, es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (!ext4_has_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); } __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_inodes_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_used_dirs_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); } __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_itable_unused_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_table_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); } void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); } void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); } void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) { now = clamp_val(now, 0, (1ull << 40) - 1); *lo = cpu_to_le32(lower_32_bits(now)); *hi = upper_32_bits(now); } static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) { return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); } #define ext4_update_tstamp(es, tstamp) \ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ ktime_get_real_seconds()) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) #define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. * * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: * 1. More than an hour has passed since the last superblock update, and * 2. More than 16MB have been written since the last superblock update. * * @sb: The superblock */ static void ext4_maybe_update_superblock(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; journal_t *journal = sbi->s_journal; time64_t now; __u64 last_update; __u64 lifetime_write_kbytes; __u64 diff_size; if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || !journal || (journal->j_flags & JBD2_UNMOUNT)) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) return; lifetime_write_kbytes = sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); /* Get the number of kilobytes not written to disk to account * for statistics and compare with a multiple of 16 MB. This * is used to determine when the next superblock commit should * occur (i.e. not more often than once per 16MB if there was * less written in an hour). */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone * else. Once this happens, any attempt to call mark_buffer_dirty() * (for example, by ext4_commit_super), will cause a kernel OOPS. * This is a kludge to prevent these oops until we can put in a proper * hook in del_gendisk() to inform the VFS and file system layers. */ static int block_device_ejected(struct super_block *sb) { struct inode *bd_inode = sb->s_bdev->bd_inode; struct backing_dev_info *bdi = inode_to_bdi(bd_inode); return bdi->dev == NULL; } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int error = is_journal_aborted(journal); struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { jce = list_entry(txn->t_private_list.next, struct ext4_journal_cb_entry, jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); jce->jce_func(sb, jce, error); spin_lock(&sbi->s_md_lock); } spin_unlock(&sbi->s_md_lock); } /* * This writepage callback for write_cache_pages() * takes care of a few cases after page cleaning. * * write_cache_pages() already checks for dirty pages * and calls clear_page_dirty_for_io(), which we want, * to write protect the pages. * * However, we may have to redirty a page (see below.) */ static int ext4_journalled_writepage_callback(struct folio *folio, struct writeback_control *wbc, void *data) { transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: * 1) If buffer is dirty, it means the page was dirty because it * contains a buffer that needs checkpointing. So the dirty bit * needs to be preserved so that checkpointing writes the buffer * properly. * 2) If buffer is not part of the committing transaction * (we may have just accidentally come across this buffer because * inode range tracking is not exact) or if the currently running * transaction already contains this buffer as well, dirty bit * needs to be preserved so that the buffer gets writeprotected * properly on running transaction's commit. */ jh = bh2jh(bh); if (buffer_dirty(bh) || (jh && (jh->b_transaction != transaction || jh->b_next_transaction))) { folio_redirty_for_writepage(wbc, folio); goto out; } } while ((bh = bh->b_this_page) != head); out: return AOP_WRITEPAGE_ACTIVATE; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; return write_cache_pages(mapping, &wbc, ext4_journalled_writepage_callback, jinode->i_transaction); } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { int ret; if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { int ret = 0; if (!ext4_should_journal_data(jinode->i_vfs_inode)) ret = jbd2_journal_finish_inode_data_buffers(jinode); return ret; } static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART; } struct ext4_err_translation { int code; int errno; }; #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } static struct ext4_err_translation err_translation[] = { EXT4_ERR_TRANSLATE(EIO), EXT4_ERR_TRANSLATE(ENOMEM), EXT4_ERR_TRANSLATE(EFSBADCRC), EXT4_ERR_TRANSLATE(EFSCORRUPTED), EXT4_ERR_TRANSLATE(ENOSPC), EXT4_ERR_TRANSLATE(ENOKEY), EXT4_ERR_TRANSLATE(EROFS), EXT4_ERR_TRANSLATE(EFBIG), EXT4_ERR_TRANSLATE(EEXIST), EXT4_ERR_TRANSLATE(ERANGE), EXT4_ERR_TRANSLATE(EOVERFLOW), EXT4_ERR_TRANSLATE(EBUSY), EXT4_ERR_TRANSLATE(ENOTDIR), EXT4_ERR_TRANSLATE(ENOTEMPTY), EXT4_ERR_TRANSLATE(ESHUTDOWN), EXT4_ERR_TRANSLATE(EFAULT), }; static int ext4_errno_to_code(int errno) { int i; for (i = 0; i < ARRAY_SIZE(err_translation); i++) if (err_translation[i].errno == errno) return err_translation[i].code; return EXT4_ERR_UNKNOWN; } static void save_error_info(struct super_block *sb, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* We default to EFSCORRUPTED error... */ if (error == 0) error = EFSCORRUPTED; spin_lock(&sbi->s_error_lock); sbi->s_add_error_count++; sbi->s_last_error_code = error; sbi->s_last_error_line = line; sbi->s_last_error_ino = ino; sbi->s_last_error_block = block; sbi->s_last_error_func = func; sbi->s_last_error_time = ktime_get_real_seconds(); if (!sbi->s_first_error_time) { sbi->s_first_error_code = error; sbi->s_first_error_line = line; sbi->s_first_error_ino = ino; sbi->s_first_error_block = block; sbi->s_first_error_func = func; sbi->s_first_error_time = sbi->s_last_error_time; } spin_unlock(&sbi->s_error_lock); } /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * * On ext2, we can store the error state of the filesystem in the * superblock. That is not possible on ext4, because we may have other * write ordering constraints on the superblock which prevent us from * writing it out straight away; and given that the journal is about to * be aborted, we can't rely on the current, or future, transactions to * write out the superblock safely. * * We'll just use the jbd2_journal_abort() error code to record an error in * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. * * If force_ro is set, we unconditionally force the filesystem into an * ABORT|READONLY state, unless the error response on the fs has been set to * panic in which case we take the easy way out and panic immediately. This is * used to deal with unrecoverable failures such as journal IO errors or ENOMEM * at a critical moment in log management. */ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { journal_t *journal = EXT4_SB(sb)->s_journal; bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); /* * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. */ if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already * disabled. */ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } if (sb_rdonly(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible before * ->s_flags update */ smp_wmb(); sb->s_flags |= SB_RDONLY; } static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; /* * If the journal is still running, we have to write out superblock * through the journal to avoid collisions of other journalled sb * updates. * * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; if (jbd2_journal_get_write_access(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } if (sbi->s_add_error_count > 0) call_notify_err = true; ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } if (jbd2_journal_dirty_metadata(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } jbd2_journal_stop(handle); if (call_notify_err) ext4_notify_error_sysfs(sbi); return; } write_directly: /* * Write through journal failed. Write sb directly to get error info * out and hope for the best. */ ext4_commit_super(sbi->s_sb); ext4_notify_error_sysfs(sbi); } #define ext4_error_ratelimit(sb) \ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, unsigned int line, bool force_ro, int error, __u64 block, const char *fmt, ...) { struct va_format vaf; va_list args; if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); } fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); ext4_handle_error(sb, force_ro, error, 0, block, function, line); } void __ext4_error_inode(struct inode *inode, const char *function, unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: block %llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } void __ext4_error_file(struct file *file, const char *function, unsigned int line, ext4_fsblk_t block, const char *fmt, ...) { va_list args; struct va_format vaf; struct inode *inode = file_inode(file); char pathname[80], *path; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "block %llu: comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, path, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, path, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { char *errstr = NULL; switch (errno) { case -EFSCORRUPTED: errstr = "Corrupt filesystem"; break; case -EFSBADCRC: errstr = "Filesystem failed CRC"; break; case -EIO: errstr = "IO failure"; break; case -ENOMEM: errstr = "Out of memory"; break; case -EROFS: if (!sb || (EXT4_SB(sb)->s_journal && EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; break; default: /* If the caller passed in an extra buffer for unknown * errors, textualise them now. Else we just return * NULL. */ if (nbuf) { /* Check for truncated error codes... */ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) errstr = nbuf; } break; } return errstr; } /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ void __ext4_std_error(struct super_block *sb, const char *function, unsigned int line, int errno) { char nbuf[16]; const char *errstr; if (unlikely(ext4_forced_shutdown(sb))) return; /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. */ if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) return; if (ext4_error_ratelimit(sb)) { errstr = ext4_decode_error(sb, errno, nbuf); printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } void __ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; if (sb) { atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (sb) printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); else printk("%sEXT4-fs: %pV\n", prefix, &vaf); va_end(args); } static int ext4_warning_ratelimit(struct super_block *sb) { atomic_inc(&EXT4_SB(sb)->s_warning_count); return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), "EXT4-fs warning"); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", sb->s_id, function, line, &vaf); va_end(args); } void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(inode->i_sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { struct va_format vaf; va_list args; if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) printk(KERN_CONT "inode %lu: ", ino); if (block) printk(KERN_CONT "block %llu:", (unsigned long long) block); printk(KERN_CONT "%pV\n", &vaf); va_end(args); } if (test_opt(sb, ERRORS_CONT)) { if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } ext4_unlock_group(sb, grp); ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); /* * We only get here in the ERRORS_RO case; relocking the group * may be dangerous, but nothing bad will happen since the * filesystem will have already been marked read/only and the * journal has been aborted. We return 1 as a hint to callers * who might what to use the return value from * ext4_grp_locked_error() to distinguish between the * ERRORS_CONT and ERRORS_RO case, and perhaps return more * aggressively from the ext4 function in question, with a * more appropriate error code. */ ext4_lock_group(sb, grp); return; } void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t group, unsigned int flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; if (!grp || !gdp) return; if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); } if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret && gdp) { int count; count = ext4_free_inodes_count(sb, gdp); percpu_counter_sub(&sbi->s_freeinodes_counter, count); } } } void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); /* leave es->s_feature_*compat flags alone */ /* es->s_uuid will be set by e2fsck if empty */ /* * The rest of the superblock fields should be zero, and if not it * means they are likely already in use, so leave them alone. We * can leave it up to e2fsck to clean up any inconsistencies there. */ } static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; } static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) { struct list_head *l; ext4_msg(sb, KERN_ERR, "sb orphan head is %d", le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); printk(KERN_ERR " " "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, NEXT_ORPHAN(inode)); } } #ifdef CONFIG_QUOTA static int ext4_quota_off(struct super_block *sb, int type); static inline void ext4_quotas_off(struct super_block *sb, int type) { BUG_ON(type > EXT4_MAXQUOTAS); /* Use our quota_off function to clear inode flags etc. */ for (type--; type >= 0; type--) ext4_quota_off(sb, type); } /* * This is a helper function which is used in the mount/remount * codepaths (which holds s_umount) to fetch the quota file name. */ static inline char *get_qf_name(struct super_block *sb, struct ext4_sb_info *sbi, int type) { return rcu_dereference_protected(sbi->s_qf_names[type], lockdep_is_held(&sb->s_umount)); } #else static inline void ext4_quotas_off(struct super_block *sb, int type) { } #endif static int ext4_percpu_param_init(struct ext4_sb_info *sbi) { ext4_fsblk_t block; int err; block = ext4_count_free_clusters(sbi->s_sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sbi->s_sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sbi->s_sb), GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, GFP_KERNEL); if (!err) err = percpu_init_rwsem(&sbi->s_writepages_rwsem); if (err) ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); return err; } static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) { percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); } static void ext4_group_desc_free(struct ext4_sb_info *sbi) { struct buffer_head **group_desc; int i; rcu_read_lock(); group_desc = rcu_dereference(sbi->s_group_desc); for (i = 0; i < sbi->s_gdb_count; i++) brelse(group_desc[i]); kvfree(group_desc); rcu_read_unlock(); } static void ext4_flex_groups_free(struct ext4_sb_info *sbi) { struct flex_groups **flex_groups; int i; rcu_read_lock(); flex_groups = rcu_dereference(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) kvfree(flex_groups[i]); kvfree(flex_groups); } rcu_read_unlock(); } static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int aborted = 0; int err; /* * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. * update_super_work will call start_this_handle may trigger * BUG_ON. */ ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } } ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); if (!sb_rdonly(sb) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!sb_rdonly(sb)) ext4_commit_super(sb); ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list * isn't empty. The on-disk one can be non-empty if we've * detected an error and taken the fs readonly, but the * in-memory list had better be clean by this point. */ if (!list_empty(&sbi->s_orphan)) dump_orphan_list(sb, sbi); ASSERT(list_empty(&sbi->s_orphan)); sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev_handle) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ sync_blockdev(sbi->s_journal_bdev_handle->bdev); invalidate_bdev(sbi->s_journal_bdev_handle->bdev); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; ext4_stop_mmpd(sbi); brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. */ kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); } static struct kmem_cache *ext4_inode_cachep; /* * Called inside transaction, so use GFP_NOFS */ static struct inode *ext4_alloc_inode(struct super_block *sb) { struct ext4_inode_info *ei; ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); mutex_init(&ei->i_fc_lock); return &ei->vfs_inode; } static int ext4_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); if (!drop) drop = fscrypt_drop_inode(inode); trace_ext4_drop_inode(inode, drop); return drop; } static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { pr_warn("%s: inode %ld still in fc list", __func__, inode->i_ino); } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } static void ext4_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); dump_stack(); } if (EXT4_I(inode)->i_reserved_data_blocks) ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), EXT4_I(inode)->i_reserved_data_blocks); } static void ext4_shutdown(struct super_block *sb) { ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); } static void init_once(void *foo) { struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); } static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| SLAB_ACCOUNT), offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } void ext4_clear_inode(struct inode *inode) { ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct inode *inode; /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static int ext4_nfs_commit_metadata(struct inode *inode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL }; trace_ext4_nfs_commit_metadata(inode); return ext4_write_inode(inode, &wbc); } #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static struct dquot **ext4_get_dquots(struct inode *inode) { return EXT4_I(inode)->i_dquot; } static const struct dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #endif static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, .free_inode = ext4_free_in_core_inode, .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .show_options = ext4_show_options, .shutdown = ext4_shutdown, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif }; static const struct export_operations ext4_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, .commit_metadata = ext4_nfs_commit_metadata, }; enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_acl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif }; static const struct constant_table ext4_param_errors[] = { {"continue", EXT4_MOUNT_ERRORS_CONT}, {"panic", EXT4_MOUNT_ERRORS_PANIC}, {"remount-ro", EXT4_MOUNT_ERRORS_RO}, {} }; static const struct constant_table ext4_param_data[] = { {"journal", EXT4_MOUNT_JOURNAL_DATA}, {"ordered", EXT4_MOUNT_ORDERED_DATA}, {"writeback", EXT4_MOUNT_WRITEBACK_DATA}, {} }; static const struct constant_table ext4_param_data_err[] = { {"abort", Opt_data_err_abort}, {"ignore", Opt_data_err_ignore}, {} }; static const struct constant_table ext4_param_jqfmt[] = { {"vfsold", QFMT_VFS_OLD}, {"vfsv0", QFMT_VFS_V0}, {"vfsv1", QFMT_VFS_V1}, {} }; static const struct constant_table ext4_param_dax[] = { {"always", Opt_dax_always}, {"inode", Opt_dax_inode}, {"never", Opt_dax_never}, {} }; /* String parameter that allows empty argument */ #define fsparam_string_empty(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the * options and the way we show them in _ext4_show_options(). To * keep the changes to a minimum, let's keep the negative options * separate for now. */ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsddf", Opt_bsd_df), fsparam_flag ("minixdf", Opt_minix_df), fsparam_flag ("grpid", Opt_grpid), fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), fsparam_u32 ("resgid", Opt_resgid), fsparam_u32 ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), fsparam_flag ("debug", Opt_debug), fsparam_flag ("oldalloc", Opt_removed), fsparam_flag ("orlov", Opt_removed), fsparam_flag ("user_xattr", Opt_user_xattr), fsparam_flag ("acl", Opt_acl), fsparam_flag ("norecovery", Opt_noload), fsparam_flag ("noload", Opt_noload), fsparam_flag ("bh", Opt_removed), fsparam_flag ("nobh", Opt_removed), fsparam_u32 ("commit", Opt_commit), fsparam_u32 ("min_batch_time", Opt_min_batch_time), fsparam_u32 ("max_batch_time", Opt_max_batch_time), fsparam_u32 ("journal_dev", Opt_journal_dev), fsparam_bdev ("journal_path", Opt_journal_path), fsparam_flag ("journal_checksum", Opt_journal_checksum), fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum), fsparam_flag ("journal_async_commit",Opt_journal_async_commit), fsparam_flag ("abort", Opt_abort), fsparam_enum ("data", Opt_data, ext4_param_data), fsparam_enum ("data_err", Opt_data_err, ext4_param_data_err), fsparam_string_empty ("usrjquota", Opt_usrjquota), fsparam_string_empty ("grpjquota", Opt_grpjquota), fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt), fsparam_flag ("grpquota", Opt_grpquota), fsparam_flag ("quota", Opt_quota), fsparam_flag ("noquota", Opt_noquota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("prjquota", Opt_prjquota), fsparam_flag ("barrier", Opt_barrier), fsparam_u32 ("barrier", Opt_barrier), fsparam_flag ("nobarrier", Opt_nobarrier), fsparam_flag ("i_version", Opt_removed), fsparam_flag ("dax", Opt_dax), fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), fsparam_u32 ("stripe", Opt_stripe), fsparam_flag ("delalloc", Opt_delalloc), fsparam_flag ("nodelalloc", Opt_nodelalloc), fsparam_flag ("warn_on_error", Opt_warn_on_error), fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), fsparam_u32 ("debug_want_extra_isize", Opt_debug_want_extra_isize), fsparam_flag ("mblk_io_submit", Opt_removed), fsparam_flag ("nomblk_io_submit", Opt_removed), fsparam_flag ("block_validity", Opt_block_validity), fsparam_flag ("noblock_validity", Opt_noblock_validity), fsparam_u32 ("inode_readahead_blks", Opt_inode_readahead_blks), fsparam_u32 ("journal_ioprio", Opt_journal_ioprio), fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc), fsparam_flag ("dioread_nolock", Opt_dioread_nolock), fsparam_flag ("nodioread_nolock", Opt_dioread_lock), fsparam_flag ("dioread_lock", Opt_dioread_lock), fsparam_flag ("discard", Opt_discard), fsparam_flag ("nodiscard", Opt_nodiscard), fsparam_u32 ("init_itable", Opt_init_itable), fsparam_flag ("init_itable", Opt_init_itable), fsparam_flag ("noinit_itable", Opt_noinit_itable), #ifdef CONFIG_EXT4_DEBUG fsparam_flag ("fc_debug_force", Opt_fc_debug_force), fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay), #endif fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb), fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_flag ("inlinecrypt", Opt_inlinecrypt), fsparam_flag ("nombcache", Opt_nombcache), fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */ fsparam_flag ("prefetch_block_bitmaps", Opt_removed), fsparam_flag ("no_prefetch_block_bitmaps", Opt_no_prefetch_block_bitmaps), fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */ {} }; #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 #define MOPT_EXPLICIT 0x0008 #ifdef CONFIG_QUOTA #define MOPT_Q 0 #define MOPT_QFMT 0x0010 #else #define MOPT_Q MOPT_NOSUPPORT #define MOPT_QFMT MOPT_NOSUPPORT #endif #define MOPT_NO_EXT2 0x0020 #define MOPT_NO_EXT3 0x0040 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_SKIP 0x0080 #define MOPT_2 0x0100 static const struct mount_opts { int token; int mount_opt; int flags; } ext4_mount_opts[] = { {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, {Opt_dax_type, 0, MOPT_EXT4_ONLY}, {Opt_journal_dev, 0, MOPT_NO_EXT2}, {Opt_journal_path, 0, MOPT_NO_EXT2}, {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, #else {Opt_acl, 0, MOPT_NOSUPPORT}, #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_jqfmt, 0, MOPT_QFMT}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; #if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; unsigned int version; } ext4_sb_encoding_map[] = { {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; static const struct ext4_sb_encodings * ext4_sb_read_encoding(const struct ext4_super_block *es) { __u16 magic = le16_to_cpu(es->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) if (magic == ext4_sb_encoding_map[i].magic) return &ext4_sb_encoding_map[i]; return NULL; } #endif #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) #define EXT4_SPEC_s_inode_readahead_blks (1 << 10) #define EXT4_SPEC_s_li_wait_mult (1 << 11) #define EXT4_SPEC_s_max_dir_size_kb (1 << 12) #define EXT4_SPEC_s_stripe (1 << 13) #define EXT4_SPEC_s_resuid (1 << 14) #define EXT4_SPEC_s_resgid (1 << 15) #define EXT4_SPEC_s_commit_interval (1 << 16) #define EXT4_SPEC_s_fc_debug_max_replay (1 << 17) #define EXT4_SPEC_s_sb_block (1 << 18) #define EXT4_SPEC_mb_optimize_scan (1 << 19) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif unsigned short qname_spec; unsigned long vals_s_flags; /* Bits to set in s_flags */ unsigned long mask_s_flags; /* Bits changed in s_flags */ unsigned long journal_devnum; unsigned long s_commit_interval; unsigned long s_stripe; unsigned int s_inode_readahead_blks; unsigned int s_want_extra_isize; unsigned int s_li_wait_mult; unsigned int s_max_dir_size_kb; unsigned int journal_ioprio; unsigned int vals_s_mount_opt; unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; u32 s_min_batch_time; kuid_t s_resuid; kgid_t s_resgid; ext4_fsblk_t s_sb_block; }; static void ext4_fc_free(struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; int i; if (!ctx) return; for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } int ext4_init_fs_context(struct fs_context *fc) { struct ext4_fs_context *ctx; ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fc->fs_private = ctx; fc->ops = &ext4_context_ops; return 0; } #ifdef CONFIG_QUOTA /* * Note the name of the specified quota file. */ static int note_qf_name(struct fs_context *fc, int qtype, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; char *qname; if (param->size < 1) { ext4_msg(NULL, KERN_ERR, "Missing quota name"); return -EINVAL; } if (strchr(param->string, '/')) { ext4_msg(NULL, KERN_ERR, "quotafile must be on filesystem root"); return -EINVAL; } if (ctx->s_qf_names[qtype]) { if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) { ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(qtype)); return -EINVAL; } return 0; } qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { ext4_msg(NULL, KERN_ERR, "Not enough memory for storing quotafile name"); return -ENOMEM; } ctx->s_qf_names[qtype] = qname; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } /* * Clear the name of the specified quota file. */ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; if (ctx->s_qf_names[qtype]) kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } #endif static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, struct ext4_fs_context *ctx) { int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption option not supported"); return -EINVAL; } err = fscrypt_parse_test_dummy_encryption(param, &ctx->dummy_enc_policy); if (err == -EINVAL) { ext4_msg(NULL, KERN_WARNING, "Value of option \"%s\" is unrecognized", param->key); } else if (err == -EEXIST) { ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return err; } #define EXT4_SET_CTX(name) \ static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ } #define EXT4_TEST_CTX(name) \ static inline unsigned long \ ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ return (ctx->vals_s_##name & flag); \ } EXT4_SET_CTX(flags); /* set only */ EXT4_SET_CTX(mount_opt); EXT4_CLEAR_CTX(mount_opt); EXT4_TEST_CTX(mount_opt); EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; struct fs_parse_result result; const struct mount_opts *m; int is_remount; kuid_t uid; kgid_t gid; int token; token = fs_parse(fc, ext4_param_specs, param, &result); if (token < 0) return token; is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; for (m = ext4_mount_opts; m->token != Opt_err; m++) if (token == m->token) break; ctx->opt_flags |= m->flags; if (m->flags & MOPT_EXPLICIT) { if (m->mount_opt & EXT4_MOUNT_DELALLOC) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC); } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM); } else return -EINVAL; } if (m->flags & MOPT_NOSUPPORT) { ext4_msg(NULL, KERN_ERR, "%s option not supported", param->key); return 0; } switch (token) { #ifdef CONFIG_QUOTA case Opt_usrjquota: if (!*param->string) return unnote_qf_name(fc, USRQUOTA); else return note_qf_name(fc, USRQUOTA, param); case Opt_grpjquota: if (!*param->string) return unnote_qf_name(fc, GRPQUOTA); else return note_qf_name(fc, GRPQUOTA, param); #endif case Opt_sb: if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { ext4_msg(NULL, KERN_WARNING, "Ignoring %s option on remount", param->key); } else { ctx->s_sb_block = result.uint_32; ctx->spec |= EXT4_SPEC_s_sb_block; } return 0; case Opt_removed: ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); #else ext4_msg(NULL, KERN_ERR, "inline encryption not supported"); #endif return 0; case Opt_errors: ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK); ctx_set_mount_opt(ctx, result.uint_32); return 0; #ifdef CONFIG_QUOTA case Opt_jqfmt: ctx->s_jquota_fmt = result.uint_32; ctx->spec |= EXT4_SPEC_JQFMT; return 0; #endif case Opt_data: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); ctx_set_mount_opt(ctx, result.uint_32); ctx->spec |= EXT4_SPEC_DATAJ; return 0; case Opt_commit: if (result.uint_32 == 0) result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; else if (result.uint_32 > INT_MAX / HZ) { ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " "must be smaller than %d", result.uint_32, INT_MAX / HZ); return -EINVAL; } ctx->s_commit_interval = HZ * result.uint_32; ctx->spec |= EXT4_SPEC_s_commit_interval; return 0; case Opt_debug_want_extra_isize: if ((result.uint_32 & 1) || (result.uint_32 < 4)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", result.uint_32); return -EINVAL; } ctx->s_want_extra_isize = result.uint_32; ctx->spec |= EXT4_SPEC_s_want_extra_isize; return 0; case Opt_max_batch_time: ctx->s_max_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_batch_time; return 0; case Opt_min_batch_time: ctx->s_min_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_min_batch_time; return 0; case Opt_inode_readahead_blks: if (result.uint_32 && (result.uint_32 > (1 << 30) || !is_power_of_2(result.uint_32))) { ext4_msg(NULL, KERN_ERR, "EXT4-fs: inode_readahead_blks must be " "0 or a power of 2 smaller than 2^31"); return -EINVAL; } ctx->s_inode_readahead_blks = result.uint_32; ctx->spec |= EXT4_SPEC_s_inode_readahead_blks; return 0; case Opt_init_itable: ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE); ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (param->type == fs_value_is_string) ctx->s_li_wait_mult = result.uint_32; ctx->spec |= EXT4_SPEC_s_li_wait_mult; return 0; case Opt_max_dir_size_kb: ctx->s_max_dir_size_kb = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_dir_size_kb; return 0; #ifdef CONFIG_EXT4_DEBUG case Opt_fc_debug_max_replay: ctx->s_fc_debug_max_replay = result.uint_32; ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay; return 0; #endif case Opt_stripe: ctx->s_stripe = result.uint_32; ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) { ext4_msg(NULL, KERN_ERR, "Invalid uid value %d", result.uint_32); return -EINVAL; } ctx->s_resuid = uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) { ext4_msg(NULL, KERN_ERR, "Invalid gid value %d", result.uint_32); return -EINVAL; } ctx->s_resgid = gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } ctx->journal_devnum = result.uint_32; ctx->spec |= EXT4_SPEC_JOURNAL_DEV; return 0; case Opt_journal_path: { struct inode *journal_inode; struct path path; int error; if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); return -EINVAL; } journal_inode = d_inode(path.dentry); ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev); ctx->spec |= EXT4_SPEC_JOURNAL_DEV; path_put(&path); return 0; } case Opt_journal_ioprio: if (result.uint_32 > 7) { ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority" " (must be 0-7)"); return -EINVAL; } ctx->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32); ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX { int type = (token == Opt_dax) ? Opt_dax : result.uint_32; switch (type) { case Opt_dax: case Opt_dax_always: ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); break; case Opt_dax_never: ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); break; case Opt_dax_inode: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); /* Strictly for printing options */ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE); break; } return 0; } #else ext4_msg(NULL, KERN_INFO, "dax option not supported"); return -EINVAL; #endif case Opt_data_err: if (result.uint_32 == Opt_data_err_abort) ctx_set_mount_opt(ctx, m->mount_opt); else if (result.uint_32 == Opt_data_err_ignore) ctx_clear_mount_opt(ctx, m->mount_opt); return 0; case Opt_mb_optimize_scan: if (result.int_32 == 1) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else if (result.int_32 == 0) { ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else { ext4_msg(NULL, KERN_WARNING, "mb_optimize_scan should be set to 0 or 1."); return -EINVAL; } return 0; } /* * At this point we should only be getting options requiring MOPT_SET, * or MOPT_CLEAR. Anything else is a bug */ if (m->token == Opt_err) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } else { unsigned int set = 0; if ((param->type == fs_value_is_flag) || result.uint_32 > 0) set = 1; if (m->flags & MOPT_CLEAR) set = !set; else if (unlikely(!(m->flags & MOPT_SET))) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } if (m->flags & MOPT_2) { if (set != 0) ctx_set_mount_opt2(ctx, m->mount_opt); else ctx_clear_mount_opt2(ctx, m->mount_opt); } else { if (set != 0) ctx_set_mount_opt(ctx, m->mount_opt); else ctx_clear_mount_opt(ctx, m->mount_opt); } } return 0; } static int parse_options(struct fs_context *fc, char *options) { struct fs_parameter param; int ret; char *key; if (!options) return 0; while ((key = strsep(&options, ",")) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); param.type = fs_value_is_flag; param.string = NULL; if (value) { if (value == key) continue; *value++ = 0; v_len = strlen(value); param.string = kmemdup_nul(value, v_len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } param.key = key; param.size = v_len; ret = ext4_parse_param(fc, &param); if (param.string) kfree(param.string); if (ret < 0) return ret; } } ret = ext4_validate_options(fc); if (ret < 0) return ret; return 0; } static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); char *s_mount_opts = NULL; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; if (!sbi->s_es->s_mount_opts[0]) return 0; s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, sizeof(sbi->s_es->s_mount_opts), GFP_KERNEL); if (!s_mount_opts) return ret; fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) goto out_free; s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!s_ctx) goto out_free; fc->fs_private = s_ctx; fc->s_fs_info = sbi; ret = parse_options(fc, s_mount_opts); if (ret < 0) goto parse_failed; ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) { parse_failed: ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); ret = 0; goto out_free; } if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV) m_ctx->journal_devnum = s_ctx->journal_devnum; if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; ext4_apply_options(fc, sb); ret = 0; out_free: if (fc) { ext4_fc_free(fc); kfree(fc); } kfree(s_mount_opts); return ret; } static void ext4_apply_quota_options(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA bool quota_feature = ext4_has_feature_quota(sb); struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; int i; if (quota_feature) return; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; qname = ctx->s_qf_names[i]; /* May be NULL */ if (qname) set_opt(sb, QUOTA); ctx->s_qf_names[i] = NULL; qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) kfree_rcu_mightsleep(qname); } } if (ctx->spec & EXT4_SPEC_JQFMT) sbi->s_jquota_fmt = ctx->s_jquota_fmt; #endif } /* * Check quota settings consistency. */ static int ext4_check_quota_consistency(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); bool quota_feature = ext4_has_feature_quota(sb); bool quota_loaded = sb_any_quota_loaded(sb); bool usr_qf_name, grp_qf_name, usrquota, grpquota; int quota_flags, i; /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) && !ext4_has_feature_project(sb)) { ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. " "Cannot enable project quota enforcement."); return -EINVAL; } quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA; if (quota_loaded && ctx->mask_s_mount_opt & quota_flags && !ctx_test_mount_opt(ctx, quota_flags)) goto err_quota_change; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; if (quota_loaded && !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i]) goto err_jquota_change; if (sbi->s_qf_names[i] && ctx->s_qf_names[i] && strcmp(get_qf_name(sb, sbi, i), ctx->s_qf_names[i]) != 0) goto err_jquota_specified; } if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Journaled quota options ignored when " "QUOTA feature is enabled"); return 0; } } if (ctx->spec & EXT4_SPEC_JQFMT) { if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded) goto err_jquota_change; if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Quota format mount options " "ignored when QUOTA feature is enabled"); return 0; } } /* Make sure we don't mix old and new quota format */ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) || ctx->s_qf_names[USRQUOTA]); grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) || ctx->s_qf_names[GRPQUOTA]); usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || test_opt(sb, USRQUOTA)); grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) || test_opt(sb, GRPQUOTA)); if (usr_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); usrquota = false; } if (grp_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); grpquota = false; } if (usr_qf_name || grp_qf_name) { if (usrquota || grpquota) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) { ext4_msg(NULL, KERN_ERR, "journaled quota format " "not specified"); return -EINVAL; } } return 0; err_quota_change: ext4_msg(NULL, KERN_ERR, "Cannot change quota options when quota turned on"); return -EINVAL; err_jquota_change: ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota " "options when quota turned on"); return -EINVAL; err_jquota_specified: ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(i)); return -EINVAL; #else return 0; #endif } static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption requires encrypt feature"); return -EINVAL; } /* * This mount option is just for testing, and it's not worthwhile to * implement the extra complexity (e.g. RCU protection) that would be * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } /* Also make sure s_mount_opts didn't contain a conflicting value. */ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, struct super_block *sb) { if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || /* if already set, it was already verified to be the same */ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) return; EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; int err; if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext2"); return -EINVAL; } if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext3"); return -EINVAL; } if (ctx->s_want_extra_isize > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", ctx->s_want_extra_isize); return -EINVAL; } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) { int blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (blocksize < PAGE_SIZE) ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an " "experimental mount option 'dioread_nolock' " "for blocksize < PAGE_SIZE"); } err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { if (!sbi->s_journal) { ext4_msg(NULL, KERN_WARNING, "Remounting file system with no journal " "so ignoring journalled data option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) != test_opt(sb, DATA_FLAGS)) { ext4_msg(NULL, KERN_ERR, "Cannot change data mode " "on remount"); return -EINVAL; } } if (is_remount) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { fail_dax_change_remount: ext4_msg(NULL, KERN_ERR, "can't change " "dax mount option while remounting"); return -EINVAL; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) && (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) { goto fail_dax_change_remount; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) && ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { goto fail_dax_change_remount; } } return ext4_check_quota_consistency(fc, sb); } static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) APPLY(s_commit_interval); APPLY(s_stripe); APPLY(s_max_batch_time); APPLY(s_min_batch_time); APPLY(s_want_extra_isize); APPLY(s_inode_readahead_blks); APPLY(s_max_dir_size_kb); APPLY(s_li_wait_mult); APPLY(s_resgid); APPLY(s_resuid); #ifdef CONFIG_EXT4_DEBUG APPLY(s_fc_debug_max_replay); #endif ext4_apply_quota_options(fc, sb); ext4_apply_test_dummy_encryption(ctx, sb); } static int ext4_validate_options(struct fs_context *fc) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; char *usr_qf_name, *grp_qf_name; usr_qf_name = ctx->s_qf_names[USRQUOTA]; grp_qf_name = ctx->s_qf_names[GRPQUOTA]; if (usr_qf_name || grp_qf_name) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } } #endif return 1; } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; switch (sbi->s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; case QFMT_VFS_V0: fmtname = "vfsv0"; break; case QFMT_VFS_V1: fmtname = "vfsv1"; break; } seq_printf(seq, ",jqfmt=%s", fmtname); } rcu_read_lock(); usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); if (usr_qf_name) seq_show_option(seq, "usrjquota", usr_qf_name); if (grp_qf_name) seq_show_option(seq, "grpjquota", grp_qf_name); rcu_read_unlock(); #endif } static const char *token2str(int token) { const struct fs_parameter_spec *spec; for (spec = ext4_param_specs; spec->name != NULL; spec++) if (spec->opt == token && !spec->type) break; return spec->name; } /* * Show an option if * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, int nodefs) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int def_errors; const struct mount_opts *m; char sep = nodefs ? '\n' : ','; #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) if (sbi->s_sb_block != 1) SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; int opt_2 = m->flags & MOPT_2; unsigned int mount_opt, def_mount_opt; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || m->flags & MOPT_SKIP) continue; if (opt_2) { mount_opt = sbi->s_mount_opt2; def_mount_opt = sbi->s_def_mount_opt2; } else { mount_opt = sbi->s_mount_opt; def_mount_opt = sbi->s_def_mount_opt; } /* skip if same as the default */ if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) continue; /* select Opt_noFoo vs Opt_Foo */ if ((want_set && (mount_opt & m->mount_opt) != m->mount_opt) || (!want_set && (mount_opt & m->mount_opt))) continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) SEQ_OPTS_PUTS("errors=remount-ro"); if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) SEQ_OPTS_PUTS("errors=continue"); if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) SEQ_OPTS_PUTS("errors=panic"); if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) SEQ_OPTS_PUTS("data=ordered"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) SEQ_OPTS_PUTS("data=writeback"); } if (nodefs || sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) SEQ_OPTS_PRINT("inode_readahead_blks=%u", sbi->s_inode_readahead_blks); if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); fscrypt_show_test_dummy_encryption(seq, sep, sb); if (sb->s_flags & SB_INLINECRYPT) SEQ_OPTS_PUTS("inlinecrypt"); if (test_opt(sb, DAX_ALWAYS)) { if (IS_EXT2_SB(sb)) SEQ_OPTS_PUTS("dax"); else SEQ_OPTS_PUTS("dax=always"); } else if (test_opt2(sb, DAX_NEVER)) { SEQ_OPTS_PUTS("dax=never"); } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && !test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=0"); } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD && test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=1"); } ext4_show_quota_options(seq, sb); return 0; } static int ext4_show_options(struct seq_file *seq, struct dentry *root) { return _ext4_show_options(seq, root->d_sb, 0); } int ext4_seq_options_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; int rc; seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); seq_puts(seq, "\n"); return rc; } static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; goto done; } if (read_only) goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, "warning: maximal mount count reached, " "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (ext4_get_tstamp(es, s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) ext4_msg(sb, KERN_WARNING, "warning: checktime reached, " "running e2fsck is recommended"); if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } err = ext4_commit_super(sb); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **old_groups, **new_groups; int size, i, j; if (!sbi->s_log_groups_per_flex) return 0; size = ext4_flex_group(sbi, ngroup - 1) + 1; if (size <= sbi->s_flex_groups_allocated) return 0; new_groups = kvzalloc(roundup_pow_of_two(size * sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex group pointers", size); return -ENOMEM; } for (i = sbi->s_flex_groups_allocated; i < size; i++) { new_groups[i] = kvzalloc(roundup_pow_of_two( sizeof(struct flex_groups)), GFP_KERNEL); if (!new_groups[i]) { for (j = sbi->s_flex_groups_allocated; j < i; j++) kvfree(new_groups[j]); kvfree(new_groups); ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size); return -ENOMEM; } } rcu_read_lock(); old_groups = rcu_dereference(sbi->s_flex_groups); if (old_groups) memcpy(new_groups, old_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups *))); rcu_read_unlock(); rcu_assign_pointer(sbi->s_flex_groups, new_groups); sbi->s_flex_groups_allocated = size; if (old_groups) ext4_kvfree_array_rcu(old_groups); return 0; } static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct flex_groups *fg; ext4_group_t flex_group; int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); if (err) goto failed; for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); atomic64_add(ext4_free_group_clusters(sb, gdp), &fg->free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); } return 1; failed: return 0; } static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; } /* old crc16 code */ if (!ext4_has_feature_gdt_csum(sb)) return 0; crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, sbi->s_desc_size - offset); out: return cpu_to_le16(crc); } int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (ext4_has_group_desc_csum(sb) && (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) return 0; return 1; } void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (!ext4_has_group_desc_csum(sb)) return; gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); } /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0); ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; ext4_group_t i, grp = sbi->s_groups_count; if (ext4_has_feature_flex_bg(sb)) flexbg_flag = 1; ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (i == sbi->s_groups_count - 1 || flexbg_flag) last_block = ext4_blocks_count(sbi->s_es) - 1; else last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); if ((grp == sbi->s_groups_count) && !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap >= sb_block + 1 && block_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap >= sb_block + 1 && inode_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_table >= sb_block + 1 && inode_table <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u not in group " "(block %llu)!", i, inode_table); return 0; } ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!sb_rdonly(sb)) { ext4_unlock_group(sb, i); return 0; } } ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } if (NULL != first_not_zeroed) *first_not_zeroed = grp; return 1; } /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk * extent format containers, within a sector_t, and within i_blocks * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * * However there is other limiting factor. We do store extents in the form * of starting block and length, hence the resulting length of the extent * covering maximum file size must fit into on-disk format containers as * well. Given that length is always by 1 unit bigger than max unit (because * we count 0 as well) we have to lower the s_maxbytes by one fs block. * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (blkbits - 9); upper_limit <<= blkbits; } /* * 32-bit extent-start container, ee_block. We lower the maxbytes * by one fs block, so ee_len can cover the extent of maximum file * size */ res = (1LL << 32) - 1; res <<= blkbits; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) res = upper_limit; return res; } /* * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; unsigned int ppb = 1 << (bits - 2); /* * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field * represents total file blocks in 2^32 512-byte sectors == * size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (bits - 9); } else { /* * We use 48 bit ext4_inode i_blocks * With EXT4_HUGE_FILE_FL set the i_blocks * represent total number of blocks in * file system block size */ upper_limit = (1LL << 48) - 1; } /* Compute how many blocks we can address by block tree */ res += ppb; res += ppb * ppb; res += ((loff_t)ppb) * ppb * ppb; /* Compute how many metadata blocks are needed */ meta_blocks = 1; meta_blocks += 1 + ppb; meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; /* How many metadata blocks are needed for addressing upper_limit? */ upper_limit -= EXT4_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; upper_limit -= ppb; /* double indirect blocks */ if (upper_limit < ppb * ppb) { meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb); res -= meta_blocks; goto check_lfs; } meta_blocks += 1 + ppb; upper_limit -= ppb * ppb; /* tripple indirect blocks for the rest */ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) + DIV_ROUND_UP_ULL(upper_limit, ppb*ppb); res -= meta_blocks; check_lfs: res <<= bits; if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; return res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, ext4_fsblk_t logical_sb_block, int nr) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t bg, first_meta_bg; int has_super = 0; first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) return logical_sb_block + nr + 1; bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) has_super = 1; /* * If we have a meta_bg fs with 1k blocks, group 0's GDT is at * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled * on modern mke2fs or blksize > 1k on older mke2fs) then we must * compensate. */ if (sb->s_blocksize == 1024 && nr == 0 && le32_to_cpu(sbi->s_es->s_first_data_block) == 0) has_super++; return (has_super + ext4_group_first_block_no(sb, bg)); } /** * ext4_get_stripe_size: Get the stripe size. * @sbi: In memory super block info * * If we have specified it via mount option, then * use the mount option value. If the value specified at mount time is * greater than the blocks per group use the super block value. * If the super block value is greater than blocks per group return 0. * Allocator needs it be less than blocks per group. * */ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) { unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) ret = sbi->s_stripe; else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) ret = stripe_width; else if (stride && stride <= sbi->s_blocks_per_group) ret = stride; else ret = 0; /* * If the stripe width is 1, this makes no sense and * we set it to 0 to turn off stripe handling code. */ if (ret <= 1) ret = 0; return ret; } /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. */ int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, "Couldn't mount because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)); return 0; } #if !IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } #endif if (readonly) return 1; if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); sb->s_flags |= SB_RDONLY; return 1; } /* Check that feature set is OK for a read-write mount */ if (ext4_has_unknown_ext4_ro_compat_features(sb)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " "extents feature\n"); return 0; } #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); return 0; } #endif /* CONFIG_QUOTA */ return 1; } /* * This function is called once a day if we have errors logged * on the file system */ static void print_daily_error_info(struct timer_list *t) { struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; struct ext4_super_block *es = sbi->s_es; if (es->s_error_count) /* fsck newer than v1.41.13 is needed to clean this condition. */ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); printk(KERN_CONT "\n"); } mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; struct super_block *sb = elr->lr_super; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ext4_group_t group = elr->lr_next_group; unsigned int prefetch_ios = 0; int ret = 0; int nr = EXT4_SB(sb)->s_mb_prefetch; u64 start_time; if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; } } return ret; } for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; break; } if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } if (group >= ngroups) ret = 1; if (!ret) { start_time = ktime_get_real_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } return ret; } /* * Remove lr_request from the list_request and free the * request structure. Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { if (!elr) return; list_del(&elr->lr_request); EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } static void ext4_unregister_li_request(struct super_block *sb) { mutex_lock(&ext4_li_mtx); if (!ext4_li_info) { mutex_unlock(&ext4_li_mtx); return; } mutex_lock(&ext4_li_info->li_list_mtx); ext4_remove_li_request(EXT4_SB(sb)->s_li_request); mutex_unlock(&ext4_li_info->li_list_mtx); mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; /* * This is the function where ext4lazyinit thread lives. It walks * through the request list searching for next scheduled filesystem. * When such a fs is found, run the lazy initialization request * (ext4_rn_li_request) and keep track of the time spend in this * function. Based on that time we compute next schedule time of * the request. When walking through the list is complete, compute * next waking time and put itself into sleep. */ static int ext4_lazyinit_thread(void *arg) { struct ext4_lazy_init *eli = arg; struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; BUG_ON(NULL == eli); set_freezable(); cont_thread: while (true) { next_wakeup = MAX_JIFFY_OFFSET; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); goto exit_thread; } list_for_each_safe(pos, n, &eli->li_request_list) { int err = 0; int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); if (time_before(jiffies, elr->lr_next_sched)) { if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { if (sb_start_write_trylock(elr->lr_super)) { progress = 1; /* * We hold sb->s_umount, sb can not * be removed from the list, it is * now safe to drop li_list_mtx */ mutex_unlock(&eli->li_list_mtx); err = ext4_run_li_request(elr); sb_end_write(elr->lr_super); mutex_lock(&eli->li_list_mtx); n = pos->next; } up_read((&elr->lr_super->s_umount)); } /* error, remove the lazy_init job */ if (err) { ext4_remove_li_request(elr); continue; } if (!progress) { elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; if ((time_after_eq(cur, next_wakeup)) || (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } schedule_timeout_interruptible(next_wakeup - cur); if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; } } exit_thread: /* * It looks like the request list is empty, but we need * to check it under the li_list_mtx lock, to prevent any * additions into it, and of course we should lock ext4_li_mtx * to atomically free the list and ext4_li_info, because at * this point another ext4 filesystem could be registering * new one. */ mutex_lock(&ext4_li_mtx); mutex_lock(&eli->li_list_mtx); if (!list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); mutex_unlock(&ext4_li_mtx); goto cont_thread; } mutex_unlock(&eli->li_list_mtx); kfree(ext4_li_info); ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); return 0; } static void ext4_clear_request_list(void) { struct list_head *pos, *n; struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); ext4_remove_li_request(elr); } mutex_unlock(&ext4_li_info->li_list_mtx); } static int ext4_run_lazyinit_thread(void) { ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; return 0; } /* * Check whether it make sense to run itable init. thread or not. * If there is at least one uninitialized inode table, return * corresponding group number, else the loop goes through all * groups and return total number of groups. */ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) { ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *gdp = NULL; if (!ext4_has_group_desc_csum(sb)) return ngroups; for (group = 0; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) continue; if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } return group; } static int ext4_li_info_new(void) { struct ext4_lazy_init *eli = NULL; eli = kzalloc(sizeof(*eli), GFP_KERNEL); if (!eli) return -ENOMEM; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; return 0; } static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); if (!elr) return NULL; elr->lr_super = sb; elr->lr_first_not_zeroed = start; if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; } else { elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* * Randomize first schedule time of the request to * spread the inode table initialization requests * better. */ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr = NULL; ext4_group_t ngroups = sbi->s_groups_count; int ret = 0; mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; goto out; } if (sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) { ret = -ENOMEM; goto out; } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); if (ret) goto out; } mutex_lock(&ext4_li_info->li_list_mtx); list_add(&elr->lr_request, &ext4_li_info->li_request_list); mutex_unlock(&ext4_li_info->li_list_mtx); sbi->s_li_request = elr; /* * set elr to NULL here since it has been inserted to * the request_list and the removal and free of it is * handled by ext4_clear_request_list from now on. */ elr = NULL; if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { ret = ext4_run_lazyinit_thread(); if (ret) goto out; } out: mutex_unlock(&ext4_li_mtx); if (ret) kfree(elr); return ret; } /* * We do not need to lock anything since this is called on * module unload. */ static void ext4_destroy_lazyinit_thread(void) { /* * If thread exited earlier * there's nothing to be done. */ if (!ext4_li_info || !ext4_lazyinit_task) return; kthread_stop(ext4_lazyinit_task); } static int set_journal_csum_feature_set(struct super_block *sb) { int ret = 1; int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; incompat = 0; } jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | incompat); } else if (test_opt(sb, JOURNAL_CHECKSUM)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, incompat); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; } /* * Note: calculating the overhead so we can be compatible with * historical BSD practice is quite difficult in the face of * clusters/bigalloc. This is because multiple metadata blocks from * different block group can end up in the same allocation cluster. * Calculating the exact overhead in the face of clustered allocation * requires either O(all block bitmaps) in memory or O(number of block * groups**2) in time. We will still calculate the superblock for * older file systems --- and if we come across with a bigalloc file * system with zero in s_overhead_clusters the estimate will be close to * correct especially for very large cluster sizes --- but for newer * file systems, it's better to calculate this figure once at mkfs * time, and store it in the superblock. If the superblock value is * present (even for non-bigalloc file systems), we will use it. */ static int count_overhead(struct super_block *sb, ext4_group_t grp, char *buf) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp; ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) return (has_super + ext4_bg_num_gdb(sb, grp) + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + (grp * EXT4_BLOCKS_PER_GROUP(sb)); last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); b = ext4_block_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_table(sb, gdp); if (b >= first_block && b + sbi->s_itb_per_group <= last_block) for (j = 0; j < sbi->s_itb_per_group; j++, b++) { int c = EXT4_B2C(sbi, b - first_block); ext4_set_bit(c, buf); count++; } if (i != grp) continue; s = 0; if (ext4_bg_has_super(sb, grp)) { ext4_set_bit(s++, buf); count++; } j = ext4_bg_num_gdb(sb, grp); if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { ext4_error(sb, "Invalid number of block group " "descriptor blocks: %d", j); j = EXT4_BLOCKS_PER_GROUP(sb) - s; } count += j; for (; j > 0; j--) ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; return EXT4_CLUSTERS_PER_GROUP(sb) - ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); } /* * Compute the overhead and stash it in sbi->s_overhead */ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct inode *j_inode; unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); if (!buf) return -ENOMEM; /* * Compute the overhead (FS structures). This is constant * for a given filesystem unless the number of block groups * changes so we cache the previous value until it does. */ /* * All of the blocks before first_data_block are overhead */ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { int blks; blks = count_overhead(sb, i, buf); overhead += blks; if (blks) memset(buf, 0, PAGE_SIZE); cond_resched(); } /* * Add the internal journal blocks whether the journal has been * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev_handle) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); } else { ext4_msg(sb, KERN_ERR, "can't get journal size"); } } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); return 0; } static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * There's no need to reserve anything when we aren't using extents. * The space estimates are exact, there are no unwritten extents, * hole punching doesn't need new metadata... This is needed especially * to keep ext2/3 backward compatibility. */ if (!ext4_has_feature_extents(sb)) return; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting * unwritten extents in delalloc path. In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ resv_clusters = (ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits); do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); atomic64_set(&sbi->s_resv_clusters, resv_clusters); } static const char *ext4_quota_mode(struct super_block *sb) { #ifdef CONFIG_QUOTA if (!ext4_quota_capable(sb)) return "none"; if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) return "journalled"; else return "writeback"; #else return "disabled"; #endif } static void ext4_setup_csum_trigger(struct super_block *sb, enum ext4_journal_trigger_type type, void (*trigger)( struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size)) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_journal_triggers[type].sb = sb; sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; } static void ext4_free_sbi(struct ext4_sb_info *sbi) { if (!sbi) return; kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) { struct ext4_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return NULL; sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) goto err_out; sb->s_fs_info = sbi; sbi->s_sb = sb; return sbi; err_out: fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } static void ext4_set_def_opts(struct super_block *sb, struct ext4_super_block *es) { unsigned long def_mount_opts; /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ if (ext4_has_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); /* block_validity enabled by default; disable with noblock_validity */ set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); if (sb->s_blocksize == PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int clustersize; /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); if (ext4_has_feature_bigalloc(sb)) { if (clustersize < sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", sbi->s_clusters_per_group); return -EINVAL; } if (sbi->s_blocks_per_group != (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " "clusters per group (%lu) inconsistent", sbi->s_blocks_per_group, sbi->s_clusters_per_group); return -EINVAL; } } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "fragment/cluster size (%d) != " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#blocks per group too big: %lu", sbi->s_blocks_per_group); return -EINVAL; } sbi->s_clusters_per_group = sbi->s_blocks_per_group; sbi->s_cluster_bits = 0; } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); return 0; } static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; sbi->s_fc_replay_state.fc_regions_used = 0; sbi->s_fc_replay_state.fc_regions_valid = 0; sbi->s_fc_replay_state.fc_modified_inodes = NULL; sbi->s_fc_replay_state.fc_modified_inodes_size = 0; sbi->s_fc_replay_state.fc_modified_inodes_used = 0; } static int ext4_inode_info_init(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; } else { sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { ext4_msg(sb, KERN_ERR, "invalid first ino: %u", sbi->s_first_ino); return -EINVAL; } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > sb->s_blocksize)) { ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); return -EINVAL; } /* * i_atime_extra is the last extra field available for * [acm]times in struct ext4_inode. Checking for that * field should suffice to ensure we have extra space * for all three. */ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { sb->s_time_gran = 1; sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; } else { sb->s_time_gran = NSEC_PER_SEC; sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; } sb->s_time_min = EXT4_TIMESTAMP_MIN; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; if (ext4_has_feature_extra_isize(sb)) { unsigned v, max = (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE); v = le16_to_cpu(es->s_want_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_want_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; v = le16_to_cpu(es->s_min_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_min_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; } } return 0; } #if IS_ENABLED(CONFIG_UNICODE) static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); if (!ext4_has_feature_casefold(sb) || sb->s_encoding) return 0; encoding_info = ext4_sb_read_encoding(es); if (!encoding_info) { ext4_msg(sb, KERN_ERR, "Encoding requested by superblock is unknown"); return -EINVAL; } encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { ext4_msg(sb, KERN_ERR, "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. flags: 0x%x.", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); return -EINVAL; } ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); sb->s_encoding = encoding; sb->s_encoding_flags = encoding_flags; return 0; } #else static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { return 0; } #endif static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Warn if metadata_csum and gdt_csum are both set. */ if (ext4_has_feature_metadata_csum(sb) && ext4_has_feature_gdt_csum(sb)) ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); /* Check for a known checksum algorithm */ if (!ext4_verify_csum_type(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "unknown checksum algorithm."); return -EINVAL; } ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, ext4_orphan_file_block_trigger); /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { int ret = PTR_ERR(sbi->s_chksum_driver); ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); sbi->s_chksum_driver = NULL; return ret; } /* Check superblock checksum */ if (!ext4_superblock_csum_verify(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "invalid superblock checksum. Run e2fsck?"); return -EFSBADCRC; } /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || ext4_has_incompat_features(sb))) ext4_msg(sb, KERN_WARNING, "feature flags set on rev 0 fs, " "running e2fsck is recommended"); if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { set_opt2(sb, HURD_COMPAT); if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); return -EINVAL; } /* * ea_inode feature uses l_i_version field which is not * available in HURD_COMPAT mode. */ if (ext4_has_feature_ea_inode(sb)) { ext4_msg(sb, KERN_ERR, "ea_inode feature is not supported for Hurd"); return -EINVAL; } } if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext[34] filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); return -EINVAL; } } if (IS_EXT3_SB(sb)) { if (ext3_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext3 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext4 filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); return -EINVAL; } } /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; if (sbi->s_daxdev) { if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); else ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); } if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); return -EINVAL; } if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); return -EINVAL; } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", es->s_encryption_level); return -EINVAL; } return 0; } static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; int err; if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); return -EINVAL; } /* * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); return err; } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", ext4_blocks_count(es), blocks_count); return -EINVAL; } /* * It makes no sense for the first data block to be beyond the end * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); return -EINVAL; } if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && (sbi->s_cluster_ratio == 1)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block is 0 with a 1k block and cluster size"); return -EINVAL; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " "(block count %llu, first data block %u, " "blocks per group %lu)", blocks_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); return -EINVAL; } sbi->s_groups_count = blocks_count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != le32_to_cpu(es->s_inodes_count)) { ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", le32_to_cpu(es->s_inodes_count), ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); return -EINVAL; } return 0; } static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", le32_to_cpu(es->s_first_meta_bg), db_count); return -EINVAL; } } rcu_assign_pointer(sbi->s_group_desc, kvmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL)); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); return -ENOMEM; } bgl_lock_init(sbi->s_blockgroup_lock); /* Pre-read the descriptors into the buffer cache */ for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); ext4_sb_breadahead_unmovable(sb, block); } for (i = 0; i < db_count; i++) { struct buffer_head *bh; block = descriptor_loc(sb, logical_sb_block, i); bh = ext4_sb_bread_unmovable(sb, block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; rcu_read_unlock(); } sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); return -EFSCORRUPTED; } return 0; } static int ext4_load_and_init_journal(struct super_block *sb, struct ext4_super_block *es, struct ext4_fs_context *ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; err = ext4_load_journal(sb, es, ctx->journal_devnum); if (err) return err; if (ext4_has_feature_64bit(sb) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto out; } if (!set_journal_csum_feature_set(sb)) { ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " "feature set"); goto out; } if (test_opt2(sb, JOURNAL_FAST_COMMIT) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { ext4_msg(sb, KERN_ERR, "Failed to set fast commit journal feature"); goto out; } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { case 0: /* No mode set, assume a default based on the journal * capabilities: ORDERED_DATA if the journal can * cope, else JOURNAL_DATA */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { set_opt(sb, ORDERED_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; } else { set_opt(sb, JOURNAL_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; } break; case EXT4_MOUNT_ORDERED_DATA: case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); goto out; } break; default: break; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); goto out; } set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = ext4_journal_finish_inode_data_buffers; return 0; out: /* flush s_sb_upd_work before destroying the journal. */ flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; return -EINVAL; } static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " "data=journal disables delayed allocation, " "dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); return -EINVAL; } if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ext4_has_feature_encrypt(sb)) { ext4_msg(sb, KERN_WARNING, "encrypted files will use data=ordered " "instead of data journaling mode"); } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { sb->s_iflags |= SB_I_CGROUPWB; } return 0; } static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es; ext4_fsblk_t logical_sb_block; unsigned long offset = 0; struct buffer_head *bh; int ret = -EINVAL; int blocksize; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set blocksize"); return -EINVAL; } /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. */ if (blocksize != EXT4_MIN_BLOCK_SIZE) { logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); } else { logical_sb_block = sbi->s_sb_block; } bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); return PTR_ERR(bh); } /* * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto out; } if (le32_to_cpu(es->s_log_block_size) > (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log block size: %u", le32_to_cpu(es->s_log_block_size)); goto out; } if (le32_to_cpu(es->s_log_cluster_size) > (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log cluster size: %u", le32_to_cpu(es->s_log_cluster_size)); goto out; } blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); /* * If the default block size is not the same as the real block size, * we need to reload it. */ if (sb->s_blocksize == blocksize) { *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; } /* * bh must be released before kill_bdev(), otherwise * it won't be freed and its page also. kill_bdev() * is called by sb_set_blocksize(). */ brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); bh = NULL; goto out; } logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); ret = PTR_ERR(bh); bh = NULL; goto out; } es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; out: brelse(bh); return ret; } static void ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) sbi->s_hash_unsigned = 3; else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); sbi->s_hash_unsigned = 3; #else if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif } } } static int ext4_block_group_meta_init(struct super_block *sb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int has_huge_files; has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { ext4_msg(sb, KERN_ERR, "unsupported descriptor size %lu", sbi->s_desc_size); return -EINVAL; } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); return -EINVAL; } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", sbi->s_inodes_per_group); return -EINVAL; } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); return 0; } static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t logical_sb_block; struct inode *root; int needs_recovery; int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; es = sbi->s_es; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); err = ext4_init_metadata_csum(sb, es); if (err) goto failed_mount; ext4_set_def_opts(sb, es); sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; /* * set default s_li_wait_mult for lazyinit, for the case there is * no mount option specified. */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; err = ext4_inode_info_init(sb, es); if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); if (err < 0) goto failed_mount; sbi->s_def_mount_opt = sbi->s_mount_opt; sbi->s_def_mount_opt2 = sbi->s_mount_opt2; err = ext4_check_opt_consistency(fc, sb); if (err < 0) goto failed_mount; ext4_apply_options(fc, sb); err = ext4_encoding_init(sb, es); if (err) goto failed_mount; err = ext4_check_journal_data_mode(sb); if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; err = ext4_block_group_meta_init(sb, silent); if (err) goto failed_mount; ext4_hash_info_init(sb); err = ext4_handle_clustersize(sb); if (err) goto failed_mount; err = ext4_check_geometry(sb, es); if (err) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) goto failed_mount3; err = ext4_es_register_shrinker(sbi); if (err) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); /* * It's hard to get stripe aligned blocks if stripe is not aligned with * cluster, just disable stripe and alert user to simpfy code and avoid * stripe aligned allocation which will rarely successes. */ if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 && sbi->s_stripe % sbi->s_cluster_ratio != 0) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", sbi->s_stripe, sbi->s_cluster_ratio); sbi->s_stripe = 0; } sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode */ sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif #ifdef CONFIG_FS_VERITY sb->s_vop = &ext4_verityops; #endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); ext4_fast_commit_init(sb); sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto failed_mount3a; } err = -EINVAL; /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit, fs mounted w/o journal"); goto failed_mount3a; } if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_checksum, fs mounted w/o journal"); goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { ext4_msg(sb, KERN_ERR, "can't mount with " "commit=%lu, fs mounted w/o journal", sbi->s_commit_interval / HZ); goto failed_mount3a; } if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { ext4_msg(sb, KERN_ERR, "can't mount with " "data=, fs mounted w/o journal"); goto failed_mount3a; } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; } if (!test_opt(sb, NO_MBCACHE)) { sbi->s_ea_block_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); err = -EINVAL; goto failed_mount_wq; } if (ext4_has_feature_ea_inode(sb)) { sbi->s_ea_inode_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_inode_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_inode_cache"); err = -EINVAL; goto failed_mount_wq; } } } /* * Get the # of file system overhead blocks from the * superblock if present. */ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); /* ignore the precalculated value if it is ridiculous */ if (sbi->s_overhead > ext4_blocks_count(es)) sbi->s_overhead = 0; /* * If the bigalloc feature is not enabled recalculating the * overhead doesn't take long, so we might as well just redo * it to make sure we are using the correct value. */ if (!ext4_has_feature_bigalloc(sb)) sbi->s_overhead = 0; if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; } /* * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ EXT4_SB(sb)->rsv_conversion_wq = alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->rsv_conversion_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); err = -ENOMEM; goto failed_mount4; } /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. */ root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); err = PTR_ERR(root); root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); err = -EFSCORRUPTED; goto failed_mount4; } sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); err = -ENOMEM; goto failed_mount4; } err = ext4_setup_super(sb, es, sb_rdonly(sb)); if (err == -EROFS) { sb->s_flags |= SB_RDONLY; } else if (err) goto failed_mount4a; ext4_set_resv_clusters(sb); if (test_opt(sb, BLOCK_VALIDITY)) { err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); goto failed_mount4a; } } ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); /* * Enable optimize_scan if number of groups is > threshold. This can be * turned off by passing "mb_optimize_scan=0". This can also be * turned on forcefully by passing "mb_optimize_scan=1". */ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) { if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) set_opt2(sb, MB_OPTIMIZE_SCAN); else clear_opt2(sb, MB_OPTIMIZE_SCAN); } err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); goto failed_mount5; } /* * We can only set up the journal commit callback once * mballoc is initialized */ if (sbi->s_journal) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; err = ext4_percpu_param_init(sbi); if (err) goto failed_mount6; if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); err = -ENOMEM; goto failed_mount6; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; err = ext4_register_sysfs(sb); if (err) goto failed_mount7; err = ext4_init_orphan_info(sb); if (err) goto failed_mount8; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount9; } #endif /* CONFIG_QUOTA */ /* * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. */ spin_lock_init(&sbi->s_bdev_wb_lock); errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; /* * Update the checksum after updating free space/inode counters and * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect * checksum in the buffer cache until it is written out and * e2fsprogs programs trying to open a file system immediately * after it is mounted can fail. */ ext4_superblock_csum_set(sb); if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) goto failed_mount10; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ /* Enable message ratelimiting. Default is 10 messages per 5 secs. */ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); return 0; failed_mount10: ext4_quotas_off(sb, EXT4_MAXQUOTAS); failed_mount9: __maybe_unused ext4_release_orphan_info(sb); failed_mount8: ext4_unregister_sysfs(sb); kobject_put(&sbi->s_kobj); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { /* flush s_sb_upd_work before journal destroy. */ flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); ext4_group_desc_free(sbi); failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_QUOTA for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); if (sbi->s_journal_bdev_handle) { invalidate_bdev(sbi->s_journal_bdev_handle->bdev); bdev_release(sbi->s_journal_bdev_handle); } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; return err; } static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi; const char *descr; int ret; sbi = ext4_alloc_sbi(sb); if (!sbi) return -ENOMEM; fc->s_fs_info = sbi; /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); sbi->s_sb_block = 1; /* Default super block location */ if (ctx->spec & EXT4_SPEC_s_sb_block) sbi->s_sb_block = ctx->s_sb_block; ret = __ext4_fill_super(fc, sb); if (ret < 0) goto free_sbi; if (sbi->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) descr = " journalled data mode"; else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) descr = " ordered data mode"; else descr = " writeback data mode"; } else descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. " "Quota mode: %s.", &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); return 0; free_sbi: ext4_free_sbi(sbi); fc->s_fs_info = NULL; return ret; } static int ext4_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ext4_fill_super); } /* * Setup any per-fs journal parameters now. We'll do this both on * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. */ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); journal->j_commit_interval = sbi->s_commit_interval; journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; ext4_fc_init(sb, journal); write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; if (test_opt(sb, DATA_ERR_ABORT)) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. */ journal->j_flags |= JBD2_CYCLE_RECORD; write_unlock(&journal->j_state_lock); } static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; /* * Test for the existence of a valid inode on disk. Bad things * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */