Total coverage: 142546 (8%)of 1832176
7 4 1 3 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 // SPDX-License-Identifier: GPL-2.0-or-later /* * IP6 tables REJECT target module * Linux INET6 implementation * * Copyright (C)2003 USAGI/WIDE Project * * Authors: * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> * * Copyright (c) 2005-2007 Patrick McHardy <kaber@trash.net> * * Based on net/ipv4/netfilter/ipt_REJECT.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/gfp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmpv6.h> #include <linux/netdevice.h> #include <net/icmp.h> #include <net/flow.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_REJECT.h> #include <net/netfilter/ipv6/nf_reject.h> MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>"); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); MODULE_LICENSE("GPL"); static unsigned int reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_reject_info *reject = par->targinfo; struct net *net = xt_net(par); switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: nf_send_unreach6(net, skb, ICMPV6_NOROUTE, xt_hooknum(par)); break; case IP6T_ICMP6_ADM_PROHIBITED: nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, xt_hooknum(par)); break; case IP6T_ICMP6_NOT_NEIGHBOUR: nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, xt_hooknum(par)); break; case IP6T_ICMP6_ADDR_UNREACH: nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, xt_hooknum(par)); break; case IP6T_ICMP6_PORT_UNREACH: nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, xt_hooknum(par)); break; case IP6T_ICMP6_ECHOREPLY: /* Do nothing */ break; case IP6T_TCP_RESET: nf_send_reset6(net, par->state->sk, skb, xt_hooknum(par)); break; case IP6T_ICMP6_POLICY_FAIL: nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par)); break; case IP6T_ICMP6_REJECT_ROUTE: nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, xt_hooknum(par)); break; } return NF_DROP; } static int reject_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_reject_info *rejinfo = par->targinfo; const struct ip6t_entry *e = par->entryinfo; if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { pr_info_ratelimited("ECHOREPLY is not supported\n"); return -EINVAL; } else if (rejinfo->with == IP6T_TCP_RESET) { /* Must specify that it's a TCP packet */ if (!(e->ipv6.flags & IP6T_F_PROTO) || e->ipv6.proto != IPPROTO_TCP || (e->ipv6.invflags & XT_INV_PROTO)) { pr_info_ratelimited("TCP_RESET illegal for non-tcp\n"); return -EINVAL; } } return 0; } static struct xt_target reject_tg6_reg __read_mostly = { .name = "REJECT", .family = NFPROTO_IPV6, .target = reject_tg6, .targetsize = sizeof(struct ip6t_reject_info), .table = "filter", .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT), .checkentry = reject_tg6_check, .me = THIS_MODULE }; static int __init reject_tg6_init(void) { return xt_register_target(&reject_tg6_reg); } static void __exit reject_tg6_exit(void) { xt_unregister_target(&reject_tg6_reg); } module_init(reject_tg6_init); module_exit(reject_tg6_exit);
522 517 518 515 518 523 520 209 209 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 // SPDX-License-Identifier: GPL-2.0-only /* * Link physical devices with ACPI devices support * * Copyright (c) 2005 David Shaohua Li <shaohua.li@intel.com> * Copyright (c) 2005 Intel Corp. */ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/acpi_iort.h> #include <linux/export.h> #include <linux/init.h> #include <linux/list.h> #include <linux/device.h> #include <linux/slab.h> #include <linux/rwsem.h> #include <linux/acpi.h> #include <linux/dma-mapping.h> #include <linux/pci.h> #include <linux/pci-acpi.h> #include <linux/platform_device.h> #include "internal.h" static LIST_HEAD(bus_type_list); static DECLARE_RWSEM(bus_type_sem); #define PHYSICAL_NODE_STRING "physical_node" #define PHYSICAL_NODE_NAME_SIZE (sizeof(PHYSICAL_NODE_STRING) + 10) int register_acpi_bus_type(struct acpi_bus_type *type) { if (acpi_disabled) return -ENODEV; if (type && type->match && type->find_companion) { down_write(&bus_type_sem); list_add_tail(&type->list, &bus_type_list); up_write(&bus_type_sem); pr_info("bus type %s registered\n", type->name); return 0; } return -ENODEV; } EXPORT_SYMBOL_GPL(register_acpi_bus_type); int unregister_acpi_bus_type(struct acpi_bus_type *type) { if (acpi_disabled) return 0; if (type) { down_write(&bus_type_sem); list_del_init(&type->list); up_write(&bus_type_sem); pr_info("bus type %s unregistered\n", type->name); return 0; } return -ENODEV; } EXPORT_SYMBOL_GPL(unregister_acpi_bus_type); static struct acpi_bus_type *acpi_get_bus_type(struct device *dev) { struct acpi_bus_type *tmp, *ret = NULL; down_read(&bus_type_sem); list_for_each_entry(tmp, &bus_type_list, list) { if (tmp->match(dev)) { ret = tmp; break; } } up_read(&bus_type_sem); return ret; } #define FIND_CHILD_MIN_SCORE 1 #define FIND_CHILD_MID_SCORE 2 #define FIND_CHILD_MAX_SCORE 3 static int match_any(struct acpi_device *adev, void *not_used) { return 1; } static bool acpi_dev_has_children(struct acpi_device *adev) { return acpi_dev_for_each_child(adev, match_any, NULL) > 0; } static int find_child_checks(struct acpi_device *adev, bool check_children) { unsigned long long sta; acpi_status status; if (check_children && !acpi_dev_has_children(adev)) return -ENODEV; status = acpi_evaluate_integer(adev->handle, "_STA", NULL, &sta); if (status == AE_NOT_FOUND) { /* * Special case: backlight device objects without _STA are * preferred to other objects with the same _ADR value, because * it is more likely that they are actually useful. */ if (adev->pnp.type.backlight) return FIND_CHILD_MID_SCORE; return FIND_CHILD_MIN_SCORE; } if (ACPI_FAILURE(status) || !(sta & ACPI_STA_DEVICE_ENABLED)) return -ENODEV; /* * If the device has a _HID returning a valid ACPI/PNP device ID, it is * better to make it look less attractive here, so that the other device * with the same _ADR value (that may not have a valid device ID) can be * matched going forward. [This means a second spec violation in a row, * so whatever we do here is best effort anyway.] */ if (adev->pnp.type.platform_id) return FIND_CHILD_MIN_SCORE; return FIND_CHILD_MAX_SCORE; } struct find_child_walk_data { struct acpi_device *adev; u64 address; int score; bool check_sta; bool check_children; }; static int check_one_child(struct acpi_device *adev, void *data) { struct find_child_walk_data *wd = data; int score; if (!adev->pnp.type.bus_address || acpi_device_adr(adev) != wd->address) return 0; if (!wd->adev) { /* * This is the first matching object, so save it. If it is not * necessary to look for any other matching objects, stop the * search. */ wd->adev = adev; return !(wd->check_sta || wd->check_children); } /* * There is more than one matching device object with the same _ADR * value. That really is unexpected, so we are kind of beyond the scope * of the spec here. We have to choose which one to return, though. * * First, get the score for the previously found object and terminate * the walk if it is maximum. */ if (!wd->score) { score = find_child_checks(wd->adev, wd->check_children); if (score == FIND_CHILD_MAX_SCORE) return 1; wd->score = score; } /* * Second, if the object that has just been found has a better score, * replace the previously found one with it and terminate the walk if * the new score is maximum. */ score = find_child_checks(adev, wd->check_children); if (score > wd->score) { wd->adev = adev; if (score == FIND_CHILD_MAX_SCORE) return 1; wd->score = score; } /* Continue, because there may be better matches. */ return 0; } static struct acpi_device *acpi_find_child(struct acpi_device *parent, u64 address, bool check_children, bool check_sta) { struct find_child_walk_data wd = { .address = address, .check_children = check_children, .check_sta = check_sta, .adev = NULL, .score = 0, }; if (parent) acpi_dev_for_each_child(parent, check_one_child, &wd); return wd.adev; } struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children) { return acpi_find_child(parent, address, check_children, true); } EXPORT_SYMBOL_GPL(acpi_find_child_device); struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev, acpi_bus_address adr) { return acpi_find_child(adev, adr, false, false); } EXPORT_SYMBOL_GPL(acpi_find_child_by_adr); static void acpi_physnode_link_name(char *buf, unsigned int node_id) { if (node_id > 0) snprintf(buf, PHYSICAL_NODE_NAME_SIZE, PHYSICAL_NODE_STRING "%u", node_id); else strcpy(buf, PHYSICAL_NODE_STRING); } int acpi_bind_one(struct device *dev, struct acpi_device *acpi_dev) { struct acpi_device_physical_node *physical_node, *pn; char physical_node_name[PHYSICAL_NODE_NAME_SIZE]; struct list_head *physnode_list; unsigned int node_id; int retval = -EINVAL; if (has_acpi_companion(dev)) { if (acpi_dev) { dev_warn(dev, "ACPI companion already set\n"); return -EINVAL; } else { acpi_dev = ACPI_COMPANION(dev); } } if (!acpi_dev) return -EINVAL; acpi_dev_get(acpi_dev); get_device(dev); physical_node = kzalloc(sizeof(*physical_node), GFP_KERNEL); if (!physical_node) { retval = -ENOMEM; goto err; } mutex_lock(&acpi_dev->physical_node_lock); /* * Keep the list sorted by node_id so that the IDs of removed nodes can * be recycled easily. */ physnode_list = &acpi_dev->physical_node_list; node_id = 0; list_for_each_entry(pn, &acpi_dev->physical_node_list, node) { /* Sanity check. */ if (pn->dev == dev) { mutex_unlock(&acpi_dev->physical_node_lock); dev_warn(dev, "Already associated with ACPI node\n"); kfree(physical_node); if (ACPI_COMPANION(dev) != acpi_dev) goto err; put_device(dev); acpi_dev_put(acpi_dev); return 0; } if (pn->node_id == node_id) { physnode_list = &pn->node; node_id++; } } physical_node->node_id = node_id; physical_node->dev = dev; list_add(&physical_node->node, physnode_list); acpi_dev->physical_node_count++; if (!has_acpi_companion(dev)) ACPI_COMPANION_SET(dev, acpi_dev); acpi_physnode_link_name(physical_node_name, node_id); retval = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj, physical_node_name); if (retval) dev_err(&acpi_dev->dev, "Failed to create link %s (%d)\n", physical_node_name, retval); retval = sysfs_create_link(&dev->kobj, &acpi_dev->dev.kobj, "firmware_node"); if (retval) dev_err(dev, "Failed to create link firmware_node (%d)\n", retval); mutex_unlock(&acpi_dev->physical_node_lock); if (acpi_dev->wakeup.flags.valid) device_set_wakeup_capable(dev, true); return 0; err: ACPI_COMPANION_SET(dev, NULL); put_device(dev); acpi_dev_put(acpi_dev); return retval; } EXPORT_SYMBOL_GPL(acpi_bind_one); int acpi_unbind_one(struct device *dev) { struct acpi_device *acpi_dev = ACPI_COMPANION(dev); struct acpi_device_physical_node *entry; if (!acpi_dev) return 0; mutex_lock(&acpi_dev->physical_node_lock); list_for_each_entry(entry, &acpi_dev->physical_node_list, node) if (entry->dev == dev) { char physnode_name[PHYSICAL_NODE_NAME_SIZE]; list_del(&entry->node); acpi_dev->physical_node_count--; acpi_physnode_link_name(physnode_name, entry->node_id); sysfs_remove_link(&acpi_dev->dev.kobj, physnode_name); sysfs_remove_link(&dev->kobj, "firmware_node"); ACPI_COMPANION_SET(dev, NULL); /* Drop references taken by acpi_bind_one(). */ put_device(dev); acpi_dev_put(acpi_dev); kfree(entry); break; } mutex_unlock(&acpi_dev->physical_node_lock); return 0; } EXPORT_SYMBOL_GPL(acpi_unbind_one); void acpi_device_notify(struct device *dev) { struct acpi_device *adev; int ret; ret = acpi_bind_one(dev, NULL); if (ret) { struct acpi_bus_type *type = acpi_get_bus_type(dev); if (!type) goto err; adev = type->find_companion(dev); if (!adev) { dev_dbg(dev, "ACPI companion not found\n"); goto err; } ret = acpi_bind_one(dev, adev); if (ret) goto err; if (type->setup) { type->setup(dev); goto done; } } else { adev = ACPI_COMPANION(dev); if (dev_is_pci(dev)) { pci_acpi_setup(dev, adev); goto done; } else if (dev_is_platform(dev)) { acpi_configure_pmsi_domain(dev); } } if (adev->handler && adev->handler->bind) adev->handler->bind(dev); done: acpi_handle_debug(ACPI_HANDLE(dev), "Bound to device %s\n", dev_name(dev)); return; err: dev_dbg(dev, "No ACPI support\n"); } void acpi_device_notify_remove(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); if (!adev) return; if (dev_is_pci(dev)) pci_acpi_cleanup(dev, adev); else if (adev->handler && adev->handler->unbind) adev->handler->unbind(dev); acpi_unbind_one(dev); }
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * linux/include/linux/jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com> * * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved * * Definitions for transaction data structures for the buffer cache * filesystem journaling support. */ #ifndef _LINUX_JBD2_H #define _LINUX_JBD2_H /* Allow this file to be included directly into e2fsprogs */ #ifndef __KERNEL__ #include "jfs_compat.h" #define JBD2_DEBUG #else #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/journal-head.h> #include <linux/stddef.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/bit_spinlock.h> #include <linux/blkdev.h> #include <linux/crc32c.h> #endif #define journal_oom_retry 1 /* * Define JBD2_PARANIOD_IOFAIL to cause a kernel BUG() if ext4 finds * certain classes of error which can occur due to failed IOs. Under * normal use we want ext4 to continue after such errors, because * hardware _can_ fail, but for debugging purposes when running tests on * known-good hardware we may want to trap these errors. */ #undef JBD2_PARANOID_IOFAIL /* * The default maximum commit age, in seconds. */ #define JBD2_DEFAULT_MAX_COMMIT_AGE 5 #ifdef CONFIG_JBD2_DEBUG /* * Define JBD2_EXPENSIVE_CHECKING to enable more expensive internal * consistency checks. By default we don't do this unless * CONFIG_JBD2_DEBUG is on. */ #define JBD2_EXPENSIVE_CHECKING void __jbd2_debug(int level, const char *file, const char *func, unsigned int line, const char *fmt, ...); #define jbd2_debug(n, fmt, a...) \ __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) #else #define jbd2_debug(n, fmt, a...) no_printk(fmt, ##a) #endif extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 #ifdef __KERNEL__ /** * typedef handle_t - The handle_t type represents a single atomic update being performed by some process. * * All filesystem modifications made by the process go * through this handle. Recursive operations (such as quota operations) * are gathered into a single update. * * The buffer credits field is used to account for journaled buffers * being modified by the running process. To ensure that there is * enough log space for all outstanding operations, we need to limit the * number of outstanding buffers possible at any time. When the * operation completes, any buffer credits not used are credited back to * the transaction, so that at all times we know how many buffers the * outstanding updates on a transaction might possibly touch. * * This is an opaque datatype. **/ typedef struct jbd2_journal_handle handle_t; /* Atomic operation type */ /** * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem. * * journal_t is linked to from the fs superblock structure. * * We use the journal_t to keep track of all outstanding transaction * activity on the filesystem, and to manage the state of the log * writing process. * * This is an opaque datatype. **/ typedef struct journal_s journal_t; /* Journal control structure */ #endif /* * Internal structures used by the logging mechanism: */ #define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */ /* * On-disk structures */ /* * Descriptor block types: */ #define JBD2_DESCRIPTOR_BLOCK 1 #define JBD2_COMMIT_BLOCK 2 #define JBD2_SUPERBLOCK_V1 3 #define JBD2_SUPERBLOCK_V2 4 #define JBD2_REVOKE_BLOCK 5 /* * Standard header for all descriptor blocks: */ typedef struct journal_header_s { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; } journal_header_t; /* * Checksum types. */ #define JBD2_CRC32_CHKSUM 1 #define JBD2_MD5_CHKSUM 2 #define JBD2_SHA1_CHKSUM 3 #define JBD2_CRC32C_CHKSUM 4 #define JBD2_CRC32_CHKSUM_SIZE 4 #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) /* * Commit block header for storing transactional checksums: * * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum* * fields are used to store a checksum of the descriptor and data blocks. * * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum * field is used to store crc32c(uuid+commit_block). Each journal metadata * block gets its own checksum, and data block checksums are stored in * journal_block_tag (in the descriptor). The other h_chksum* fields are * not used. * * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses * journal_block_tag3_t to store a full 32-bit checksum. Everything else * is the same as v2. * * Checksum v1, v2, and v3 are mutually exclusive features. */ struct commit_header { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; unsigned char h_chksum_type; unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; __be64 h_commit_sec; __be32 h_commit_nsec; }; /* * The block tag: used to describe a single buffer in the journal. * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this * raw struct shouldn't be used for pointer math or sizeof() - use * journal_tag_bytes(journal) instead to compute this. */ typedef struct journal_block_tag3_s { __be32 t_blocknr; /* The on-disk block number */ __be32 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ __be32 t_checksum; /* crc32c(uuid+seq+block) */ } journal_block_tag3_t; typedef struct journal_block_tag_s { __be32 t_blocknr; /* The on-disk block number */ __be16 t_checksum; /* truncated crc32c(uuid+seq+block) */ __be16 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ } journal_block_tag_t; /* Tail of descriptor or revoke block, for checksumming */ struct jbd2_journal_block_tail { __be32 t_checksum; /* crc32c(uuid+descr_block) */ }; /* * The revoke descriptor: used on disk to describe a series of blocks to * be revoked from the log */ typedef struct jbd2_journal_revoke_header_s { journal_header_t r_header; __be32 r_count; /* Count of bytes used in the block */ } jbd2_journal_revoke_header_t; /* Definitions for the journal tag flags word: */ #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ #define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ #define JBD2_FLAG_DELETED 4 /* block deleted by this transaction */ #define JBD2_FLAG_LAST_TAG 8 /* last tag in this descriptor block */ /* * The journal superblock. All fields are in big-endian byte order. */ typedef struct journal_superblock_s { /* 0x0000 */ journal_header_t s_header; /* 0x000C */ /* Static information describing the journal */ __be32 s_blocksize; /* journal device blocksize */ __be32 s_maxlen; /* total blocks in journal file */ __be32 s_first; /* first block of log information */ /* 0x0018 */ /* Dynamic information describing the current state of the log */ __be32 s_sequence; /* first commit ID expected in log */ __be32 s_start; /* blocknr of start of log */ /* 0x0020 */ /* Error value, as set by jbd2_journal_abort(). */ __be32 s_errno; /* 0x0024 */ /* Remaining fields are only valid in a version-2 superblock */ __be32 s_feature_compat; /* compatible feature set */ __be32 s_feature_incompat; /* incompatible feature set */ __be32 s_feature_ro_compat; /* readonly-compatible feature set */ /* 0x0030 */ __u8 s_uuid[16]; /* 128-bit uuid for journal */ /* 0x0040 */ __be32 s_nr_users; /* Nr of filesystems sharing log */ __be32 s_dynsuper; /* Blocknr of dynamic superblock copy*/ /* 0x0048 */ __be32 s_max_transaction; /* Limit of journal blocks per trans.*/ __be32 s_max_trans_data; /* Limit of data blocks per trans. */ /* 0x0050 */ __u8 s_checksum_type; /* checksum type */ __u8 s_padding2[3]; /* 0x0054 */ __be32 s_num_fc_blks; /* Number of fast commit blocks */ __be32 s_head; /* blocknr of head of log, only uptodate * while the filesystem is clean */ /* 0x005C */ __u32 s_padding[40]; __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ /* 0x0400 */ } journal_superblock_t; #define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 #define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 #define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010 #define JBD2_FEATURE_INCOMPAT_FAST_COMMIT 0x00000020 /* See "journal feature predicate functions" below */ /* Features known to this kernel version: */ #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ JBD2_FEATURE_INCOMPAT_64BIT | \ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ JBD2_FEATURE_INCOMPAT_CSUM_V2 | \ JBD2_FEATURE_INCOMPAT_CSUM_V3 | \ JBD2_FEATURE_INCOMPAT_FAST_COMMIT) #ifdef __KERNEL__ #include <linux/fs.h> #include <linux/sched.h> enum jbd_state_bits { BH_JBD /* Has an attached ext3 journal_head */ = BH_PrivateStart, BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ BH_Freed, /* Has been freed (truncated) */ BH_Revoked, /* Has been revoked from the log */ BH_RevokeValid, /* Revoked flag is valid */ BH_JBDDirty, /* Is dirty but journaled */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Shadow, /* IO on shadow buffer is running */ BH_Verified, /* Metadata block has been verified ok */ BH_JBDPrivateStart, /* First bit available for private use by FS */ }; BUFFER_FNS(JBD, jbd) BUFFER_FNS(JWrite, jwrite) BUFFER_FNS(JBDDirty, jbddirty) TAS_BUFFER_FNS(JBDDirty, jbddirty) BUFFER_FNS(Revoked, revoked) TAS_BUFFER_FNS(Revoked, revoked) BUFFER_FNS(RevokeValid, revokevalid) TAS_BUFFER_FNS(RevokeValid, revokevalid) BUFFER_FNS(Freed, freed) BUFFER_FNS(Shadow, shadow) BUFFER_FNS(Verified, verified) static inline struct buffer_head *jh2bh(struct journal_head *jh) { return jh->b_bh; } static inline struct journal_head *bh2jh(struct buffer_head *bh) { return bh->b_private; } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { bit_spin_lock(BH_JournalHead, &bh->b_state); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { bit_spin_unlock(BH_JournalHead, &bh->b_state); } #define J_ASSERT(assert) BUG_ON(!(assert)) #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) #if defined(JBD2_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) #define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr) #else #define __journal_expect(expr, why...) \ ({ \ int val = (expr); \ if (!val) { \ printk(KERN_ERR \ "JBD2 unexpected failure: %s: %s;\n", \ __func__, #expr); \ printk(KERN_ERR why "\n"); \ } \ val; \ }) #define J_EXPECT(expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) #endif /* Flags in jbd_inode->i_flags */ #define __JI_COMMIT_RUNNING 0 #define __JI_WRITE_DATA 1 #define __JI_WAIT_DATA 2 /* * Commit of the inode data in progress. We use this flag to protect us from * concurrent deletion of inode. We cannot use reference to inode for this * since we cannot afford doing last iput() on behalf of kjournald */ #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) /* Write allocated dirty buffers in this inode before commit */ #define JI_WRITE_DATA (1 << __JI_WRITE_DATA) /* Wait for outstanding data writes for this inode before commit */ #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. */ struct jbd2_inode { /** * @i_transaction: * * Which transaction does this inode belong to? Either the running * transaction or the committing one. [j_list_lock] */ transaction_t *i_transaction; /** * @i_next_transaction: * * Pointer to the running transaction modifying inode's data in case * there is already a committing transaction touching it. [j_list_lock] */ transaction_t *i_next_transaction; /** * @i_list: List of inodes in the i_transaction [j_list_lock] */ struct list_head i_list; /** * @i_vfs_inode: * * VFS inode this inode belongs to [constant for lifetime of structure] */ struct inode *i_vfs_inode; /** * @i_flags: Flags of inode [j_list_lock] */ unsigned long i_flags; /** * @i_dirty_start: * * Offset in bytes where the dirty range for this inode starts. * [j_list_lock] */ loff_t i_dirty_start; /** * @i_dirty_end: * * Inclusive offset in bytes where the dirty range for this inode * ends. [j_list_lock] */ loff_t i_dirty_end; }; struct jbd2_revoke_table_s; /** * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete * type associated with handle_t. * @h_transaction: Which compound transaction is this update a part of? * @h_journal: Which journal handle belongs to - used iff h_reserved set. * @h_rsv_handle: Handle reserved for finishing the logical operation. * @h_total_credits: Number of remaining buffers we are allowed to add to * journal. These are dirty buffers and revoke descriptor blocks. * @h_revoke_credits: Number of remaining revoke records available for handle * @h_ref: Reference count on this handle. * @h_err: Field for caller's use to track errors through large fs operations. * @h_sync: Flag for sync-on-close. * @h_jdata: Flag to force data journaling. * @h_reserved: Flag for handle for reserved credits. * @h_aborted: Flag indicating fatal error on handle. * @h_type: For handle statistics. * @h_line_no: For handle statistics. * @h_start_jiffies: Handle Start time. * @h_requested_credits: Holds @h_total_credits after handle is started. * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started. * @saved_alloc_context: Saved context while transaction is open. **/ /* Docbook can't yet cope with the bit fields, but will leave the documentation * in so it can be fixed later. */ struct jbd2_journal_handle { union { transaction_t *h_transaction; /* Which journal handle belongs to - used iff h_reserved set */ journal_t *h_journal; }; handle_t *h_rsv_handle; int h_total_credits; int h_revoke_credits; int h_revoke_credits_requested; int h_ref; int h_err; /* Flags [no locking] */ unsigned int h_sync: 1; unsigned int h_jdata: 1; unsigned int h_reserved: 1; unsigned int h_aborted: 1; unsigned int h_type: 8; unsigned int h_line_no: 16; unsigned long h_start_jiffies; unsigned int h_requested_credits; unsigned int saved_alloc_context; }; /* * Some stats for checkpoint phase */ struct transaction_chp_stats_s { unsigned long cs_chp_time; __u32 cs_forced_to_close; __u32 cs_written; __u32 cs_dropped; }; /* The transaction_t type is the guts of the journaling mechanism. It * tracks a compound transaction through its various states: * * RUNNING: accepting new updates * LOCKED: Updates still running but we don't accept new ones * RUNDOWN: Updates are tidying up but have finished requesting * new buffers to modify (state not used for now) * FLUSH: All updates complete, but we are still writing to disk * COMMIT: All data on disk, writing commit record * FINISHED: We still have to keep the transaction for checkpointing. * * The transaction keeps track of all of the buffers modified by a * running transaction, and all of the buffers committed but not yet * flushed to home for finished transactions. * (Locking Documentation improved by LockDoc) */ /* * Lock ranking: * * j_list_lock * ->jbd_lock_bh_journal_head() (This is "innermost") * * j_state_lock * ->b_state_lock * * b_state_lock * ->j_list_lock * * j_state_lock * ->j_list_lock (journal_unmap_buffer) * */ struct transaction_s { /* Pointer to the journal for this transaction. [no locking] */ journal_t *t_journal; /* Sequence number for this transaction [no locking] */ tid_t t_tid; /* * Transaction's current state * [no locking - only kjournald2 alters this] * [j_list_lock] guards transition of a transaction into T_FINISHED * state and subsequent call of __jbd2_journal_drop_transaction() * FIXME: needs barriers * KLUDGE: [use j_state_lock] */ enum { T_RUNNING, T_LOCKED, T_SWITCH, T_FLUSH, T_COMMIT, T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, T_COMMIT_CALLBACK, T_FINISHED } t_state; /* * Where in the log does this transaction's commit start? [no locking] */ unsigned long t_log_start; /* * Number of buffers on the t_buffers list [j_list_lock, no locks * needed for jbd2 thread] */ int t_nr_buffers; /* * Doubly-linked circular list of all buffers reserved but not yet * modified by this transaction [j_list_lock, no locks needed fo * jbd2 thread] */ struct journal_head *t_reserved_list; /* * Doubly-linked circular list of all metadata buffers owned by this * transaction [j_list_lock, no locks needed for jbd2 thread] */ struct journal_head *t_buffers; /* * Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) * [j_list_lock] */ struct journal_head *t_forget; /* * Doubly-linked circular list of all buffers still to be flushed before * this transaction can be checkpointed. [j_list_lock] */ struct journal_head *t_checkpoint_list; /* * Doubly-linked circular list of metadata buffers being * shadowed by log IO. The IO buffers on the iobuf list and * the shadow buffers on this list match each other one for * one at all times. [j_list_lock, no locks needed for jbd2 * thread] */ struct journal_head *t_shadow_list; /* * List of inodes associated with the transaction; e.g., ext4 uses * this to track inodes in data=ordered and data=journal mode that * need special handling on transaction commit; also used by ocfs2. * [j_list_lock] */ struct list_head t_inode_list; /* * Longest time some handle had to wait for running transaction */ unsigned long t_max_wait; /* * When transaction started */ unsigned long t_start; /* * When commit was requested [j_state_lock] */ unsigned long t_requested; /* * Checkpointing stats [j_list_lock] */ struct transaction_chp_stats_s t_chp_stats; /* * Number of outstanding updates running on this transaction * [none] */ atomic_t t_updates; /* * Number of blocks reserved for this transaction in the journal. * This is including all credits reserved when starting transaction * handles as well as all journal descriptor blocks needed for this * transaction. [none] */ atomic_t t_outstanding_credits; /* * Number of revoke records for this transaction added by already * stopped handles. [none] */ atomic_t t_outstanding_revokes; /* * How many handles used this transaction? [none] */ atomic_t t_handle_count; /* * Forward and backward links for the circular list of all transactions * awaiting checkpoint. [j_list_lock] */ transaction_t *t_cpnext, *t_cpprev; /* * When will the transaction expire (become due for commit), in jiffies? * [no locking] */ unsigned long t_expires; /* * When this transaction started, in nanoseconds [no locking] */ ktime_t t_start_time; /* * This transaction is being forced and some process is * waiting for it to finish. */ unsigned int t_synchronous_commit:1; /* Disk flush needs to be sent to fs partition [no locking] */ int t_need_data_flush; /* * For use by the filesystem to store fs-specific data * structures associated with the transaction */ struct list_head t_private_list; }; struct transaction_run_stats_s { unsigned long rs_wait; unsigned long rs_request_delay; unsigned long rs_running; unsigned long rs_locked; unsigned long rs_flushing; unsigned long rs_logging; __u32 rs_handle_count; __u32 rs_blocks; __u32 rs_blocks_logged; }; struct transaction_stats_s { unsigned long ts_tid; unsigned long ts_requested; struct transaction_run_stats_s run; }; static inline unsigned long jbd2_time_diff(unsigned long start, unsigned long end) { if (end >= start) return end - start; return end + (MAX_JIFFY_OFFSET - start); } #define JBD2_NR_BATCH 64 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; #define JBD2_FC_REPLAY_STOP 0 #define JBD2_FC_REPLAY_CONTINUE 1 /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. */ struct journal_s { /** * @j_flags: General journaling state flags [j_state_lock, * no lock for quick racy checks] */ unsigned long j_flags; /** * @j_errno: * * Is there an outstanding uncleared error on the journal (from a prior * abort)? [j_state_lock] */ int j_errno; /** * @j_abort_mutex: Lock the whole aborting procedure. */ struct mutex j_abort_mutex; /** * @j_sb_buffer: The first part of the superblock buffer. */ struct buffer_head *j_sb_buffer; /** * @j_superblock: The second part of the superblock buffer. */ journal_superblock_t *j_superblock; /** * @j_state_lock: Protect the various scalars in the journal. */ rwlock_t j_state_lock; /** * @j_barrier_count: * * Number of processes waiting to create a barrier lock [j_state_lock, * no lock for quick racy checks] */ int j_barrier_count; /** * @j_barrier: The barrier lock itself. */ struct mutex j_barrier; /** * @j_running_transaction: * * Transactions: The current running transaction... * [j_state_lock, no lock for quick racy checks] [caller holding * open handle] */ transaction_t *j_running_transaction; /** * @j_committing_transaction: * * the transaction we are pushing to disk * [j_state_lock] [caller holding open handle] */ transaction_t *j_committing_transaction; /** * @j_checkpoint_transactions: * * ... and a linked circular list of all transactions waiting for * checkpointing. [j_list_lock] */ transaction_t *j_checkpoint_transactions; /** * @j_wait_transaction_locked: * * Wait queue for waiting for a locked transaction to start committing, * or for a barrier lock to be released. */ wait_queue_head_t j_wait_transaction_locked; /** * @j_wait_done_commit: Wait queue for waiting for commit to complete. */ wait_queue_head_t j_wait_done_commit; /** * @j_wait_commit: Wait queue to trigger commit. */ wait_queue_head_t j_wait_commit; /** * @j_wait_updates: Wait queue to wait for updates to complete. */ wait_queue_head_t j_wait_updates; /** * @j_wait_reserved: * * Wait queue to wait for reserved buffer credits to drop. */ wait_queue_head_t j_wait_reserved; /** * @j_fc_wait: * * Wait queue to wait for completion of async fast commits. */ wait_queue_head_t j_fc_wait; /** * @j_checkpoint_mutex: * * Semaphore for locking against concurrent checkpoints. */ struct mutex j_checkpoint_mutex; /** * @j_chkpt_bhs: * * List of buffer heads used by the checkpoint routine. This * was moved from jbd2_log_do_checkpoint() to reduce stack * usage. Access to this array is controlled by the * @j_checkpoint_mutex. [j_checkpoint_mutex] */ struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; /** * @j_shrinker: * * Journal head shrinker, reclaim buffer's journal head which * has been written back. */ struct shrinker *j_shrinker; /** * @j_checkpoint_jh_count: * * Number of journal buffers on the checkpoint list. [j_list_lock] */ struct percpu_counter j_checkpoint_jh_count; /** * @j_shrink_transaction: * * Record next transaction will shrink on the checkpoint list. * [j_list_lock] */ transaction_t *j_shrink_transaction; /** * @j_head: * * Journal head: identifies the first unused block in the journal. * [j_state_lock] */ unsigned long j_head; /** * @j_tail: * * Journal tail: identifies the oldest still-used block in the journal. * [j_state_lock] */ unsigned long j_tail; /** * @j_free: * * Journal free: how many free blocks are there in the journal? * [j_state_lock] */ unsigned long j_free; /** * @j_first: * * The block number of the first usable block in the journal * [j_state_lock]. */ unsigned long j_first; /** * @j_last: * * The block number one beyond the last usable block in the journal * [j_state_lock]. */ unsigned long j_last; /** * @j_fc_first: * * The block number of the first fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_first; /** * @j_fc_off: * * Number of fast commit blocks currently allocated. Accessed only * during fast commit. Currently only process can do fast commit, so * this field is not protected by any lock. */ unsigned long j_fc_off; /** * @j_fc_last: * * The block number one beyond the last fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_last; /** * @j_dev: Device where we store the journal. */ struct block_device *j_dev; /** * @j_blocksize: Block size for the location where we store the journal. */ int j_blocksize; /** * @j_blk_offset: * * Starting block offset into the device where we store the journal. */ unsigned long long j_blk_offset; /** * @j_devname: Journal device name. */ char j_devname[BDEVNAME_SIZE+24]; /** * @j_fs_dev: * * Device which holds the client fs. For internal journal this will be * equal to j_dev. */ struct block_device *j_fs_dev; /** * @j_fs_dev_wb_err: * * Records the errseq of the client fs's backing block device. */ errseq_t j_fs_dev_wb_err; /** * @j_total_len: Total maximum capacity of the journal region on disk. */ unsigned int j_total_len; /** * @j_reserved_credits: * * Number of buffers reserved from the running transaction. */ atomic_t j_reserved_credits; /** * @j_list_lock: Protects the buffer lists and internal buffer state. */ spinlock_t j_list_lock; /** * @j_inode: * * Optional inode where we store the journal. If present, all * journal block numbers are mapped into this inode via bmap(). */ struct inode *j_inode; /** * @j_tail_sequence: * * Sequence number of the oldest transaction in the log [j_state_lock] */ tid_t j_tail_sequence; /** * @j_transaction_sequence: * * Sequence number of the next transaction to grant [j_state_lock] */ tid_t j_transaction_sequence; /** * @j_commit_sequence: * * Sequence number of the most recently committed transaction * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_sequence; /** * @j_commit_request: * * Sequence number of the most recent transaction wanting commit * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_request; /** * @j_uuid: * * Journal uuid: identifies the object (filesystem, LVM volume etc) * backed by this journal. This will eventually be replaced by an array * of uuids, allowing us to index multiple devices within a single * journal and to perform atomic updates across them. */ __u8 j_uuid[16]; /** * @j_task: Pointer to the current commit thread for this journal. */ struct task_struct *j_task; /** * @j_max_transaction_buffers: * * Maximum number of metadata buffers to allow in a single compound * commit transaction. */ int j_max_transaction_buffers; /** * @j_revoke_records_per_block: * * Number of revoke records that fit in one descriptor block. */ int j_revoke_records_per_block; /** * @j_transaction_overhead_buffers: * * Number of blocks each transaction needs for its own bookkeeping */ int j_transaction_overhead_buffers; /** * @j_commit_interval: * * What is the maximum transaction lifetime before we begin a commit? */ unsigned long j_commit_interval; /** * @j_commit_timer: The timer used to wakeup the commit thread. */ struct timer_list j_commit_timer; /** * @j_revoke_lock: Protect the revoke table. */ spinlock_t j_revoke_lock; /** * @j_revoke: * * The revoke table - maintains the list of revoked blocks in the * current transaction. */ struct jbd2_revoke_table_s *j_revoke; /** * @j_revoke_table: Alternate revoke tables for j_revoke. */ struct jbd2_revoke_table_s *j_revoke_table[2]; /** * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction. */ struct buffer_head **j_wbuf; /** * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only * during a fast commit. Currently only process can do fast commit, so * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; /** * @j_wbufsize: * * Size of @j_wbuf array. */ int j_wbufsize; /** * @j_fc_wbufsize: * * Size of @j_fc_wbuf array. */ int j_fc_wbufsize; /** * @j_last_sync_writer: * * The pid of the last person to run a synchronous operation * through the journal. */ pid_t j_last_sync_writer; /** * @j_average_commit_time: * * The average amount of time in nanoseconds it takes to commit a * transaction to disk. [j_state_lock] */ u64 j_average_commit_time; /** * @j_min_batch_time: * * Minimum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_min_batch_time; /** * @j_max_batch_time: * * Maximum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_max_batch_time; /** * @j_commit_callback: * * This function is called when a transaction is closed. */ void (*j_commit_callback)(journal_t *, transaction_t *); /** * @j_submit_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WRITE_DATA flag * before we start to write out the transaction to the journal. */ int (*j_submit_inode_data_buffers) (struct jbd2_inode *); /** * @j_finish_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WAIT_DATA flag * after we have written the transaction to the journal * but before we write out the commit block. */ int (*j_finish_inode_data_buffers) (struct jbd2_inode *); /* * Journal statistics */ /** * @j_history_lock: Protect the transactions statistics history. */ spinlock_t j_history_lock; /** * @j_proc_entry: procfs entry for the jbd statistics directory. */ struct proc_dir_entry *j_proc_entry; /** * @j_stats: Overall statistics. */ struct transaction_stats_s j_stats; /** * @j_failed_commit: Failed journal commit ID. */ unsigned int j_failed_commit; /** * @j_private: * * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here. */ void *j_private; /** * @j_csum_seed: * * Precomputed journal UUID checksum for seeding other checksums. */ __u32 j_csum_seed; #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * @j_trans_commit_map: * * Lockdep entity to track transaction commit dependencies. Handles * hold this "lock" for read, when we wait for commit, we acquire the * "lock" for writing. This matches the properties of jbd2 journalling * where the running transaction has to wait for all handles to be * dropped to commit that transaction and also acquiring a handle may * require transaction commit to finish. */ struct lockdep_map j_trans_commit_map; #endif /** * @j_fc_cleanup_callback: * * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: * * File-system specific function that performs replay of a fast * commit. JBD2 calls this function for each fast commit block found in * the journal. This function should return JBD2_FC_REPLAY_CONTINUE * to indicate that the block was processed correctly and more fast * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP * indicates the end of replay (no more blocks remaining). A negative * return value indicates error. */ int (*j_fc_replay_callback)(struct journal_s *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_commit_id); /** * @j_bmap: * * Bmap function that should be used instead of the generic * VFS bmap function. */ int (*j_bmap)(struct journal_s *journal, sector_t *block); }; #define jbd2_might_wait_for_commit(j) \ do { \ rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \ rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \ } while (0) /* * We can support any known requested features iff the * superblock is not in version 1. Otherwise we fail to support any * extended sb features. */ static inline bool jbd2_format_support_feature(journal_t *j) { return j->j_superblock->s_header.h_blocktype != cpu_to_be32(JBD2_SUPERBLOCK_V1); } /* journal feature predicate functions */ #define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_compat & \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat |= \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat &= \ ~cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } #define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_ro_compat & \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat |= \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat &= \ ~cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } #define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_incompat & \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat |= \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat &= \ ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } JBD2_FEATURE_COMPAT_FUNCS(checksum, CHECKSUM) JBD2_FEATURE_INCOMPAT_FUNCS(revoke, REVOKE) JBD2_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) JBD2_FEATURE_INCOMPAT_FUNCS(async_commit, ASYNC_COMMIT) JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) /* Journal high priority write IO operation flags */ #define JBD2_JOURNAL_REQ_FLAGS (REQ_META | REQ_SYNC | REQ_IDLE) /* * Journal flag definitions */ #define JBD2_UNMOUNT 0x001 /* Journal thread is being destroyed */ #define JBD2_ABORT 0x002 /* Journaling has been aborted for errors. */ #define JBD2_ACK_ERR 0x004 /* The errno in the sb has been acked */ #define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ #define JBD2_BARRIER 0x020 /* Use IDE barriers */ #define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file * data write error in ordered * mode */ #define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on * clean and empty filesystem * logging area */ #define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ #define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 #define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002 #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ JBD2_JOURNAL_FLUSH_ZEROOUT) /* * Function declarations for the journaling transaction and buffer * management */ /* Filing buffers */ extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) { list_add_tail(&bh->b_assoc_buffers, head); } static inline void jbd2_unfile_log_bh(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); } /* Log buffer allocation */ struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ enum jbd2_shrink_type {JBD2_SHRINK_DESTROY, JBD2_SHRINK_BUSY_STOP, JBD2_SHRINK_BUSY_SKIP}; void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type); unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); int jbd2_journal_try_remove_checkpoint(struct journal_head *jh); void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); /* * Triggers */ struct jbd2_buffer_trigger_type { /* * Fired a the moment data to write to the journal are known to be * stable - so either at the moment b_frozen_data is created or just * before a buffer is written to the journal. mapped_data is a mapped * buffer that is the frozen data for commit. */ void (*t_frozen)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size); /* * Fired during journal abort for dirty buffers that will not be * committed. */ void (*t_abort)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh); }; extern void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers); extern void jbd2_buffer_abort_trigger(struct journal_head *jh, struct jbd2_buffer_trigger_type *triggers); /* Buffer IO */ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct buffer_head **bh_out, sector_t blocknr); /* Transaction cache support */ extern void jbd2_journal_destroy_transaction_cache(void); extern int __init jbd2_journal_init_transaction_cache(void); extern void jbd2_journal_free_transaction(transaction_t *); /* * Journal locking. * * We need to lock the journal during transaction state changes so that nobody * ever tries to take a handle on the running transaction while we are in the * middle of moving it to the commit phase. j_state_lock does this. * * Note that the locking is completely interrupt unsafe. We never touch * journal structures from interrupts. */ static inline handle_t *journal_current_handle(void) { return current->journal_info; } /* The journaling code user interface: * * Create and destroy handles * Register buffer modifications against the current transaction. */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks, int revoke_records, gfp_t gfp_mask, unsigned int type, unsigned int line_no); extern int jbd2_journal_restart(handle_t *, int nblocks); extern int jbd2__journal_restart(handle_t *, int nblocks, int revoke_records, gfp_t gfp_mask); extern int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, unsigned int line_no); extern void jbd2_journal_free_reserved(handle_t *handle); extern int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); int jbd2_journal_invalidate_folio(journal_t *, struct folio *, size_t offset, size_t length); bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio); extern int jbd2_journal_stop(handle_t *); extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); void jbd2_journal_wait_updates(journal_t *); extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int bsize); extern journal_t * jbd2_journal_init_inode (struct inode *); extern int jbd2_journal_update_format (journal_t *); extern int jbd2_journal_check_used_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_check_available_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, blk_opf_t); extern void jbd2_journal_abort (journal_t *, int); extern int jbd2_journal_errno (journal_t *); extern void jbd2_journal_ack_err (journal_t *); extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); extern int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_finish_inode_data_buffers( struct jbd2_inode *jinode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management */ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh); struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh); void jbd2_journal_put_journal_head(struct journal_head *jh); /* * handle management */ extern struct kmem_cache *jbd2_handle_cache; /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. */ #define jbd2_alloc_handle(_gfp_flags) \ ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags)) static inline void jbd2_free_handle(handle_t *handle) { kmem_cache_free(jbd2_handle_cache, handle); } /* * jbd2_inode management (optional, for those file systems that want to use * dynamically allocated jbd2_inode structures) */ extern struct kmem_cache *jbd2_inode_cache; /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. */ #define jbd2_alloc_inode(_gfp_flags) \ ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags)) static inline void jbd2_free_inode(struct jbd2_inode *jinode) { kmem_cache_free(jbd2_inode_cache, jinode); } /* Primary revoke support */ #define JOURNAL_REVOKE_DEFAULT_HASH 256 extern int jbd2_journal_init_revoke(journal_t *, int); extern void jbd2_journal_destroy_revoke_record_cache(void); extern void jbd2_journal_destroy_revoke_table_cache(void); extern int __init jbd2_journal_init_revoke_record_cache(void); extern int __init jbd2_journal_init_revoke_table_cache(void); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); extern void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs); /* Recovery revoke support */ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); extern void jbd2_journal_clear_revoke(journal_t *); extern void jbd2_journal_switch_revoke_table(journal_t *journal); extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); /* * The log thread user interface: * * Request space in the current transaction, and force transaction commit * transitions on demand. */ int jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_transaction_committed(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); void jbd2_fc_release_bufs(journal_t *journal); /* * is_journal_abort * * Simple test wrapper function to test the JBD2_ABORT state flag. This * bit, when set, indicates that we have had a fatal error somewhere, * either inside the journaling layer or indicated to us by the client * (eg. ext3), and that we and should not commit any further * transactions. */ static inline int is_journal_aborted(journal_t *journal) { return journal->j_flags & JBD2_ABORT; } static inline int is_handle_aborted(handle_t *handle) { if (handle->h_aborted || !handle->h_transaction) return 1; return is_journal_aborted(handle->h_transaction->t_journal); } static inline void jbd2_journal_abort_handle(handle_t *handle) { handle->h_aborted = 1; } static inline void jbd2_init_fs_dev_write_error(journal_t *journal) { struct address_space *mapping = journal->j_fs_dev->bd_mapping; /* * Save the original wb_err value of client fs's bdev mapping which * could be used to detect the client fs's metadata async write error. */ errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err); } static inline int jbd2_check_fs_dev_write_error(journal_t *journal) { struct address_space *mapping = journal->j_fs_dev->bd_mapping; return errseq_check(&mapping->wb_err, READ_ONCE(journal->j_fs_dev_wb_err)); } #endif /* __KERNEL__ */ /* Comparison functions for transaction IDs: perform comparisons using * modulo arithmetic so that they work over sequence number wraps. */ static inline int tid_gt(tid_t x, tid_t y) { int difference = (x - y); return (difference > 0); } static inline int tid_geq(tid_t x, tid_t y) { int difference = (x - y); return (difference >= 0); } extern int jbd2_journal_blocks_per_page(struct inode *inode); extern size_t journal_tag_bytes(journal_t *journal); static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j) { return jbd2_has_feature_csum2(j) || jbd2_has_feature_csum3(j); } static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) { return jbd2_journal_has_csum_v2or3_feature(journal); } static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) { int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; } /* * Return number of free blocks in the log. Must be called under j_state_lock. */ static inline unsigned long jbd2_log_space_left(journal_t *journal) { /* Allow for rounding errors */ long free = journal->j_free - 32; if (journal->j_committing_transaction) { free -= atomic_read(&journal-> j_committing_transaction->t_outstanding_credits); } return max_t(long, free, 0); } /* * Definitions which augment the buffer_head layer */ /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ #define BJ_Metadata 1 /* Normal journaled metadata */ #define BJ_Forget 2 /* Buffer superseded by this transaction */ #define BJ_Shadow 3 /* Buffer contents being shadowed to the log */ #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } /* Return most recent uncommitted transaction */ static inline tid_t jbd2_get_latest_transaction(journal_t *journal) { tid_t tid; read_lock(&journal->j_state_lock); tid = journal->j_commit_request; if (journal->j_running_transaction) tid = journal->j_running_transaction->t_tid; read_unlock(&journal->j_state_lock); return tid; } static inline int jbd2_handle_buffer_credits(handle_t *handle) { journal_t *journal; if (!handle->h_reserved) journal = handle->h_transaction->t_journal; else journal = handle->h_journal; return handle->h_total_credits - DIV_ROUND_UP(handle->h_revoke_credits_requested, journal->j_revoke_records_per_block); } #ifdef __KERNEL__ #define buffer_trace_init(bh) do {} while (0) #define print_buffer_fields(bh) do {} while (0) #define print_buffer_trace(bh) do {} while (0) #define BUFFER_TRACE(bh, info) do {} while (0) #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) #define JBUFFER_TRACE(jh, info) do {} while (0) #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* _LINUX_JBD2_H */
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/fast_commit.c * * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> * * Ext4 fast commits routines. */ #include "ext4.h" #include "ext4_jbd2.h" #include "ext4_extents.h" #include "mballoc.h" /* * Ext4 Fast Commits * ----------------- * * Ext4 fast commits implement fine grained journalling for Ext4. * * Fast commits are organized as a log of tag-length-value (TLV) structs. (See * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by * TLV during the recovery phase. For the scenarios for which we currently * don't have replay code, fast commit falls back to full commits. * Fast commits record delta in one of the following three categories. * * (A) Directory entry updates: * * - EXT4_FC_TAG_UNLINK - records directory entry unlink * - EXT4_FC_TAG_LINK - records directory entry link * - EXT4_FC_TAG_CREAT - records inode and directory entry creation * * (B) File specific data range updates: * * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode * * (C) Inode metadata (mtime / ctime etc): * * - EXT4_FC_TAG_INODE - record the inode that should be replayed * during recovery. Note that iblocks field is * not replayed and instead derived during * replay. * Commit Operation * ---------------- * With fast commits, we maintain all the directory entry operations in the * order in which they are issued in an in-memory queue. This queue is flushed * to disk during the commit operation. We also maintain a list of inodes * that need to be committed during a fast commit in another in memory queue of * inodes. During the commit operation, we commit in the following order: * * [1] Lock inodes for any further data updates by setting COMMITTING state * [2] Submit data buffers of all the inodes * [3] Wait for [2] to complete * [4] Commit all the directory entry updates in the fast commit space * [5] Commit all the changed inode structures * [6] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). * [7] Wait for [4], [5] and [6] to complete. * * All the inode updates must call ext4_fc_start_update() before starting an * update. If such an ongoing update is present, fast commit waits for it to * complete. The completion of such an update is marked by * ext4_fc_stop_update(). * * Fast Commit Ineligibility * ------------------------- * * Not all operations are supported by fast commits today (e.g extended * attributes). Fast commit ineligibility is marked by calling * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back * to full commit. * * Atomicity of commits * -------------------- * In order to guarantee atomicity during the commit operation, fast commit * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail * tag contains CRC of the contents and TID of the transaction after which * this fast commit should be applied. Recovery code replays fast commit * logs only if there's at least 1 valid tail present. For every fast commit * operation, there is 1 tail. This means, we may end up with multiple tails * in the fast commit space. Here's an example: * * - Create a new file A and remove existing file B * - fsync() * - Append contents to file A * - Truncate file A * - fsync() * * The fast commit space at the end of above operations would look like this: * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| * * Replay code should thus check for all the valid tails in the FC area. * * Fast Commit Replay Idempotence * ------------------------------ * * Fast commits tags are idempotent in nature provided the recovery code follows * certain rules. The guiding principle that the commit path follows while * committing is that it stores the result of a particular operation instead of * storing the procedure. * * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' * was associated with inode 10. During fast commit, instead of storing this * operation as a procedure "rename a to b", we store the resulting file system * state as a "series" of outcomes: * * - Link dirent b to inode 10 * - Unlink dirent a * - Inode <10> with valid refcount * * Now when recovery code runs, it needs "enforce" this state on the file * system. This is what guarantees idempotence of fast commit replay. * * Let's take an example of a procedure that is not idempotent and see how fast * commits make it idempotent. Consider following sequence of operations: * * rm A; mv B A; read A * (x) (y) (z) * * (x), (y) and (z) are the points at which we can crash. If we store this * sequence of operations as is then the replay is not idempotent. Let's say * while in replay, we crash at (z). During the second replay, file A (which was * actually created as a result of "mv B A" operation) would get deleted. Thus, * file named A would be absent when we try to read A. So, this sequence of * operations is not idempotent. However, as mentioned above, instead of storing * the procedure fast commits store the outcome of each procedure. Thus the fast * commit log for above procedure would be as follows: * * (Let's assume dirent A was linked to inode 10 and dirent B was linked to * inode 11 before the replay) * * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] * (w) (x) (y) (z) * * If we crash at (z), we will have file A linked to inode 11. During the second * replay, we will remove file A (inode 11). But we will create it back and make * it point to inode 11. We won't find B, so we'll just skip that step. At this * point, the refcount for inode 11 is not reliable, but that gets fixed by the * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * * TODOs * ----- * * 0) Fast commit replay path hardening: Fast commit replay code should use * journal handles to make sure all the updates it does during the replay * path are atomic. With that if we crash during fast commit replay, after * trying to do recovery again, we will find a file system where fast commit * area is invalid (because new full commit would be found). In order to deal * with that, fast commit replay code should ensure that the "FC_REPLAY" * superblock state is persisted before starting the replay, so that after * the crash, fast commit recovery code can look at that flag and perform * fast commit recovery even if that area is invalidated by later full * commits. * * 1) Fast commit's commit path locks the entire file system during fast * commit. This has significant performance penalty. Instead of that, we * should use ext4_fc_start/stop_update functions to start inode level * updates from ext4_journal_start/stop. Once we do that we can drop file * system locking during commit path. * * 2) Handle more ineligible cases. */ #include <trace/events/ext4.h> static struct kmem_cache *ext4_fc_dentry_cachep; static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { BUFFER_TRACE(bh, ""); if (uptodate) { ext4_debug("%s: Block %lld up-to-date", __func__, bh->b_blocknr); set_buffer_uptodate(bh); } else { ext4_debug("%s: Block %lld not up-to-date", __func__, bh->b_blocknr); clear_buffer_uptodate(bh); } unlock_buffer(bh); } static inline void ext4_fc_reset_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ei->i_fc_lblk_start = 0; ei->i_fc_lblk_len = 0; } void ext4_fc_init_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_fc_reset_inode(inode); ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); atomic_set(&ei->i_fc_updates, 0); } /* This function must be called with sbi->s_fc_lock held. */ static void ext4_fc_wait_committing_inode(struct inode *inode) __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) { wait_queue_head_t *wq; struct ext4_inode_info *ei = EXT4_I(inode); #if (BITS_PER_LONG < 64) DEFINE_WAIT_BIT(wait, &ei->i_state_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else DEFINE_WAIT_BIT(wait, &ei->i_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); schedule(); finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) { return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); } /* * Inform Ext4's fast about start of an inode update * * This function is called by the high level call VFS callbacks before * performing any inode update. This function blocks if there's an ongoing * fast commit on the inode in question. */ void ext4_fc_start_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_fc_disabled(inode->i_sb)) return; restart: spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); if (list_empty(&ei->i_fc_list)) goto out; if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { ext4_fc_wait_committing_inode(inode); goto restart; } out: atomic_inc(&ei->i_fc_updates); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); } /* * Stop inode update and wake up waiting fast commits if any. */ void ext4_fc_stop_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_fc_disabled(inode->i_sb)) return; if (atomic_dec_and_test(&ei->i_fc_updates)) wake_up_all(&ei->i_fc_wait); } /* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. */ void ext4_fc_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; if (ext4_fc_disabled(inode->i_sb)) return; restart: spin_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { spin_unlock(&sbi->s_fc_lock); return; } if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { ext4_fc_wait_committing_inode(inode); goto restart; } if (!list_empty(&ei->i_fc_list)) list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { spin_unlock(&sbi->s_fc_lock); return; } fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); spin_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); return; } /* * Mark file system as fast commit ineligible, and record latest * ineligible transaction tid. This means until the recorded * transaction, commit operation would result in a full jbd2 commit. */ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; bool has_transaction = true; bool is_ineligible; if (ext4_fc_disabled(sb)) return; if (handle && !IS_ERR(handle)) tid = handle->h_transaction->t_tid; else { read_lock(&sbi->s_journal->j_state_lock); if (sbi->s_journal->j_running_transaction) tid = sbi->s_journal->j_running_transaction->t_tid; else has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } spin_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } /* * Generic fast commit tracking function. If this is the first time this we are * called after a full commit, we initialize fast commit fields and then call * __fc_track_fn() with update = 0. If we have already been called after a full * commit, we pass update = 1. Based on that, the track function can determine * if it needs to track a field for the first time or if it needs to just * update the previously tracked value. * * If enqueue is set, this function enqueues the inode in fast commit list. */ static int ext4_fc_track_template( handle_t *handle, struct inode *inode, int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool), void *args, int enqueue) { bool update = false; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); tid_t tid = 0; int ret; tid = handle->h_transaction->t_tid; mutex_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); ei->i_sync_tid = tid; } ret = __fc_track_fn(handle, inode, args, update); mutex_unlock(&ei->i_fc_lock); if (!enqueue) return ret; spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); return ret; } struct __track_dentry_update_args { struct dentry *dentry; int op; }; /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ static int __track_dentry_update(handle_t *handle, struct inode *inode, void *arg, bool update) { struct ext4_fc_dentry_update *node; struct ext4_inode_info *ei = EXT4_I(inode); struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); mutex_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, handle); mutex_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); spin_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); /* * This helps us keep a track of all fc_dentry updates which is part of * this ext4 inode. So in case the inode is getting unlinked, before * even we get a chance to fsync, we could remove all fc_dentry * references while evicting the inode in ext4_fc_del(). * Also with this, we don't need to loop over all the inodes in * sbi->s_fc_q to get the corresponding inode in * ext4_fc_commit_dentry_updates(). */ if (dentry_update->op == EXT4_FC_TAG_CREAT) { WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } spin_unlock(&sbi->s_fc_lock); mutex_lock(&ei->i_fc_lock); return 0; } void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_UNLINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_unlink(handle, inode, dentry, ret); } void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_unlink(handle, inode, dentry); } void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_LINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_link(handle, inode, dentry, ret); } void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_link(handle, inode, dentry); } void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_CREAT; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_create(handle, inode, dentry, ret); } void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_create(handle, inode, dentry); } /* __track_fn for inode tracking */ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, bool update) { if (update) return -EEXIST; EXT4_I(inode)->i_fc_lblk_len = 0; return 0; } void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } struct __track_range_args { ext4_lblk_t start, end; }; /* __track_fn for tracking data updates */ static int __track_range(handle_t *handle, struct inode *inode, void *arg, bool update) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_lblk_t oldstart; struct __track_range_args *__arg = (struct __track_range_args *)arg; if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { ext4_debug("Special inode %ld being modified\n", inode->i_ino); return -ECANCELED; } oldstart = ei->i_fc_lblk_start; if (update && ei->i_fc_lblk_len > 0) { ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); ei->i_fc_lblk_len = max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - ei->i_fc_lblk_start + 1; } else { ei->i_fc_lblk_start = __arg->start; ei->i_fc_lblk_len = __arg->end - __arg->start + 1; } return 0; } void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct __track_range_args args; int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; if (ext4_has_inline_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); return; } args.start = start; args.end = end; ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); trace_ext4_fc_track_range(handle, inode, start, end, ret); } static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { blk_opf_t write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ if (test_opt(sb, BARRIER) && is_tail) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); EXT4_SB(sb)->s_fc_bh = NULL; } /* Ext4 commit path routines */ /* * Allocate len bytes on a fast commit buffer. * * During the commit time this function is used to manage fast commit * block space. We don't split a fast commit log onto different * blocks. So this function makes sure that if there's not enough space * on the current block, the remaining space in the current block is * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, * new block is from jbd2 and CRC is updated to reflect the padding * we added. */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; int remaining; u8 *dst; /* * If 'len' is too long to fit in any block alongside a PAD tlv, then we * cannot fulfill the request. */ if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; if (!sbi->s_fc_bh) { ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; } dst = sbi->s_fc_bh->b_data + off; /* * Allocate the bytes in the current block if we can do so while still * leaving enough space for a PAD tlv. */ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; if (len <= remaining) { sbi->s_fc_bytes += len; return dst; } /* * Else, terminate the current block with a PAD tlv, then allocate a new * block and allocate the bytes at the start of that new block. */ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } /* * Complete a fast commit by writing tail tag. * * Writing tail tag marks the end of a fast commit. In order to guarantee * atomicity, after writing tail tag, even if there's space remaining * in the block, next commit shouldn't use it. That's why tail tag * has the length as that of the remaining space on the block. */ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl tl; struct ext4_fc_tail tail; int off, bsize = sbi->s_journal->j_blocksize; u8 *dst; /* * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. */ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); dst += sizeof(tail.fc_crc); memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); return 0; } /* * Adds tag, length, value and updates CRC. Returns true if tlv was added. * Returns false if there's not enough space. */ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, u32 *crc) { struct ext4_fc_tl tl; u8 *dst; dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } /* Same as above, but adds dentry tlv. */ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, struct ext4_fc_dentry_update *fc_dentry) { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; int dlen = fc_dentry->fcd_name.name.len; u8 *dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); memcpy(dst, fc_dentry->fcd_name.name.name, dlen); return true; } /* * Writes inode in the fast commit space under TLV with tag @tag. * Returns 0 on success, error on failure. */ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) { struct ext4_inode_info *ei = EXT4_I(inode); int inode_len = EXT4_GOOD_OLD_INODE_SIZE; int ret; struct ext4_iloc iloc; struct ext4_fc_inode fc_inode; struct ext4_fc_tl tl; u8 *dst; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) inode_len = EXT4_INODE_SIZE(inode->i_sb); else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) inode_len += ei->i_extra_isize; fc_inode.fc_ino = cpu_to_le32(inode->i_ino); tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) goto err; memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); ret = 0; err: brelse(iloc.bh); return ret; } /* * Writes updated data ranges for the inode in question. Updates CRC. * Returns 0 on success, error otherwise. */ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) { ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_map_blocks map; struct ext4_fc_add_range fc_ext; struct ext4_fc_del_range lrange; struct ext4_extent *ex; int ret; mutex_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { mutex_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; mutex_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return -ECANCELED; if (map.m_len == 0) { cur_lblk_off++; continue; } if (ret == 0) { lrange.fc_ino = cpu_to_le32(inode->i_ino); lrange.fc_lblk = cpu_to_le32(map.m_lblk); lrange.fc_len = cpu_to_le32(map.m_len); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, sizeof(lrange), (u8 *)&lrange, crc)) return -ENOSPC; } else { unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; /* Limit the number of blocks in one extent */ map.m_len = min(max, map.m_len); fc_ext.fc_ino = cpu_to_le32(inode->i_ino); ex = (struct ext4_extent *)&fc_ext.fc_ex; ex->ee_block = cpu_to_le32(map.m_lblk); ex->ee_len = cpu_to_le16(map.m_len); ext4_ext_store_pblock(ex, map.m_pblk); if (map.m_flags & EXT4_MAP_UNWRITTEN) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, sizeof(fc_ext), (u8 *)&fc_ext, crc)) return -ENOSPC; } cur_lblk_off += map.m_len; } return 0; } /* Submit data for all the fast commit inodes */ static int ext4_fc_submit_inode_data_all(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { DEFINE_WAIT(wait); prepare_to_wait(&ei->i_fc_wait, &wait, TASK_UNINTERRUPTIBLE); if (atomic_read(&ei->i_fc_updates)) { spin_unlock(&sbi->s_fc_lock); schedule(); spin_lock(&sbi->s_fc_lock); } finish_wait(&ei->i_fc_wait, &wait); } spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); return ret; } /* Wait for completion of data for all the fast commit inodes */ static int ext4_fc_wait_inode_data_all(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *pos, *n; int ret = 0; spin_lock(&sbi->s_fc_lock); list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { if (!ext4_test_inode_state(&pos->vfs_inode, EXT4_STATE_FC_COMMITTING)) continue; spin_unlock(&sbi->s_fc_lock); ret = jbd2_wait_inode_data(journal, pos->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) __acquires(&sbi->s_fc_lock) __releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; struct ext4_inode_info *ei; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0; list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { spin_unlock(&sbi->s_fc_lock); if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } spin_lock(&sbi->s_fc_lock); continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the * corresponding inode pointer */ WARN_ON(list_empty(&fc_dentry->fcd_dilist)); ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); spin_unlock(&sbi->s_fc_lock); /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first * and then link it to a directory entry. This allows us * to use namei.c routines almost as is and simplifies * the recovery code. */ ret = ext4_fc_write_inode(inode, crc); if (ret) goto lock_and_exit; ret = ext4_fc_write_inode_data(inode, crc); if (ret) goto lock_and_exit; if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } spin_lock(&sbi->s_fc_lock); } return 0; lock_and_exit: spin_lock(&sbi->s_fc_lock); return ret; } static int ext4_fc_perform_commit(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; struct inode *inode; struct blk_plug plug; int ret = 0; u32 crc = 0; ret = ext4_fc_submit_inode_data_all(journal); if (ret) return ret; ret = ext4_fc_wait_inode_data_all(journal); if (ret) return ret; /* * If file system device is different from journal device, issue a cache * flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); if (sbi->s_fc_bytes == 0) { /* * Add a head tag only if this is the first fast commit * in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), (u8 *)&head, &crc)) { ret = -ENOSPC; goto out; } } spin_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) { spin_unlock(&sbi->s_fc_lock); goto out; } list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_tail(sb, crc); out: blk_finish_plug(&plug); return ret; } static void ext4_fc_update_stats(struct super_block *sb, int status, u64 commit_time, int nblks, tid_t commit_tid) { struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; ext4_debug("Fast commit ended with status = %d for tid %u", status, commit_tid); if (status == EXT4_FC_STATUS_OK) { stats->fc_num_commits++; stats->fc_numblks += nblks; if (likely(stats->s_fc_avg_commit_time)) stats->s_fc_avg_commit_time = (commit_time + stats->s_fc_avg_commit_time * 3) / 4; else stats->s_fc_avg_commit_time = commit_time; } else if (status == EXT4_FC_STATUS_FAILED || status == EXT4_FC_STATUS_INELIGIBLE) { if (status == EXT4_FC_STATUS_FAILED) stats->fc_failed_commits++; stats->fc_ineligible_commits++; } else { stats->fc_skipped_commits++; } trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); } /* * The main commit entry point. Performs a fast commit for transaction * commit_tid if needed. If it's not possible to perform a fast commit * due to various reasons, we fall back to full commit. Returns 0 * on success, error otherwise. */ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); if (ret == -EALREADY) { /* There was an ongoing commit, check if we need to restart */ if (atomic_read(&sbi->s_fc_subtid) <= subtid && tid_gt(commit_tid, journal->j_commit_sequence)) goto restart_fc; ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, commit_tid); return 0; } else if (ret) { /* * Commit couldn't start. Just update stats and perform a * full commit. */ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, commit_tid); return jbd2_complete_transaction(journal, commit_tid); } /* * After establishing journal barrier via jbd2_fc_begin_commit(), check * if we are fast commit ineligible. */ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { status = EXT4_FC_STATUS_INELIGIBLE; goto fallback; } fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; ret = jbd2_fc_wait_bufs(journal, nblks); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time */ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); return ret; fallback: ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; } /* * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter, *iter_n; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); spin_lock(&sbi->s_fc_lock); list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); if (tid_geq(tid, iter->i_sync_tid)) { ext4_fc_reset_inode(&iter->vfs_inode); } else if (full) { /* * We are called after a full commit, inode has been * modified while the commit was running. Re-enqueue * the inode into STAGING, which will then be splice * back into MAIN. This cannot happen during * fastcommit because the journal is locked all the * time in that case (and tid doesn't increase so * tid check above isn't reliable). */ list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); } /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ smp_mb(); #if (BITS_PER_LONG < 64) wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); #else wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); #endif } while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], struct ext4_fc_dentry_update, fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); spin_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); spin_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], &sbi->s_fc_dentry_q[FC_Q_MAIN]); list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); if (tid_geq(tid, sbi->s_fc_ineligible_tid)) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); } if (full) sbi->s_fc_bytes = 0; spin_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } /* Ext4 Replay Path Routines */ /* Helper struct for dentry replay routines */ struct dentry_info_args { int parent_ino, dname_len, ino, inode_len; char *dname; }; /* Same as struct ext4_fc_tl, but uses native endianness fields */ struct ext4_fc_tl_mem { u16 fc_tag; u16 fc_len; }; static inline void tl_to_darg(struct dentry_info_args *darg, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_dentry_info fcd; memcpy(&fcd, val, sizeof(fcd)); darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); } static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_tl tl_disk; memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); tl->fc_len = le16_to_cpu(tl_disk.fc_len); tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); } /* Unlink replay function */ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode, *old_parent; struct qstr entry; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, darg.parent_ino, darg.dname_len); entry.name = darg.dname; entry.len = darg.dname_len; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) { ext4_debug("Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; } ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT) ret = 0; iput(old_parent); iput(inode); return ret; } static int ext4_fc_replay_link_internal(struct super_block *sb, struct dentry_info_args *darg, struct inode *inode) { struct inode *dir = NULL; struct dentry *dentry_dir = NULL, *dentry_inode = NULL; struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); int ret = 0; dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir with inode %d not found.", darg->parent_ino); dir = NULL; goto out; } dentry_dir = d_obtain_alias(dir); if (IS_ERR(dentry_dir)) { ext4_debug("Failed to obtain dentry"); dentry_dir = NULL; goto out; } dentry_inode = d_alloc(dentry_dir, &qstr_dname); if (!dentry_inode) { ext4_debug("Inode dentry not created."); ret = -ENOMEM; goto out; } ret = __ext4_link(dir, inode, dentry_inode); /* * It's possible that link already existed since data blocks * for the dir in question got persisted before we crashed OR * we replayed this tag and crashed before the entire replay * could complete. */ if (ret && ret != -EEXIST) { ext4_debug("Failed to link\n"); goto out; } ret = 0; out: if (dentry_dir) { d_drop(dentry_dir); dput(dentry_dir); } else if (dir) { iput(dir); } if (dentry_inode) { d_drop(dentry_inode); dput(dentry_inode); } return ret; } /* Link replay function */ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, darg.parent_ino, darg.dname_len); inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_replay_link_internal(sb, &darg, inode); iput(inode); return ret; } /* * Record all the modified inodes during replay. We use this later to setup * block bitmaps correctly. */ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) { struct ext4_fc_replay_state *state; int i; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { int *fc_modified_inodes; fc_modified_inodes = krealloc(state->fc_modified_inodes, sizeof(int) * (state->fc_modified_inodes_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_modified_inodes) return -ENOMEM; state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; } /* * Inode replay function */ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; size_t off_gen = offsetof(struct ext4_inode, i_generation); memcpy(&fc_inode, val, sizeof(fc_inode)); ino = le32_to_cpu(fc_inode.fc_ino); trace_ext4_fc_replay(sb, tag, ino, 0, 0); inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (!IS_ERR(inode)) { ext4_ext_clear_bb(inode); iput(inode); } inode = NULL; ret = ext4_fc_record_modified_inode(sb, ino); if (ret) goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); ret = ext4_get_fc_inode_loc(sb, ino, &iloc); if (ret) goto out; inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, inode_len - off_gen); if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); if (eh->eh_magic != EXT4_EXT_MAGIC) { memset(eh, 0, sizeof(*eh)); eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16( (sizeof(raw_inode->i_block) - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent)); } } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { memcpy(raw_inode->i_block, raw_fc_inode->i_block, sizeof(raw_inode->i_block)); } /* Immediately update the inode on disk. */ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); if (ret) goto out; ret = sync_dirty_buffer(iloc.bh); if (ret) goto out; ret = ext4_mark_inode_used(sb, ino); if (ret) goto out; /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return -EFSCORRUPTED; } /* * Our allocator could have made different decisions than before * crashing. This should be fixed but until then, we calculate * the number of blocks the inode. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) ext4_ext_replay_set_iblocks(inode); inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); ext4_reset_inode_seed(inode); ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); sync_dirty_buffer(iloc.bh); brelse(iloc.bh); out: iput(inode); if (!ret) blkdev_issue_flush(sb->s_bdev); return 0; } /* * Dentry create replay function. * * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the * inode for which we are trying to create a dentry here, should already have * been replayed before we start here. */ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { int ret = 0; struct inode *inode = NULL; struct inode *dir = NULL; struct dentry_info_args darg; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, darg.parent_ino, darg.dname_len); /* This takes care of update group descriptor and other metadata */ ret = ext4_mark_inode_used(sb, darg.ino); if (ret) goto out; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("inode %d not found.", darg.ino); inode = NULL; ret = -EINVAL; goto out; } if (S_ISDIR(inode->i_mode)) { /* * If we are creating a directory, we need to make sure that the * dot and dot dot dirents are setup properly. */ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir %d not found.", darg.ino); goto out; } ret = ext4_init_new_dir(NULL, dir, inode); iput(dir); if (ret) { ret = 0; goto out; } } ret = ext4_fc_replay_link_internal(sb, &darg, inode); if (ret) goto out; set_nlink(inode, 1); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return ret; } /* * Record physical disk regions which are in use as per fast commit area, * and used by inodes during replay phase. Our simple replay phase * allocator excludes these regions from allocation. */ int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) { struct ext4_fc_replay_state *state; struct ext4_fc_alloc_region *region; state = &EXT4_SB(sb)->s_fc_replay_state; /* * during replay phase, the fc_regions_valid may not same as * fc_regions_used, update it when do new additions. */ if (replay && state->fc_regions_used != state->fc_regions_valid) state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { struct ext4_fc_alloc_region *fc_regions; fc_regions = krealloc(state->fc_regions, sizeof(struct ext4_fc_alloc_region) * (state->fc_regions_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_regions) return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; region->lblk = lblk; region->pblk = pblk; region->len = len; if (replay) state->fc_regions_valid++; return 0; } /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; struct inode *inode; ext4_lblk_t start, cur; int remaining, len; ext4_fsblk_t start_pblk; struct ext4_map_blocks map; struct ext4_ext_path *path = NULL; int ret; memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); ex = (struct ext4_extent *)&fc_add_ex.fc_ex; trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); len = ext4_ext_get_actual_len(ex); cur = start; remaining = len; ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( &newex, start_pblk + cur - start); newex.ee_len = cpu_to_le16(map.m_len); if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); path = ext4_ext_insert_extent(NULL, inode, path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); if (IS_ERR(path)) goto out; goto next; } if (start_pblk + cur - start != map.m_pblk) { /* * Logical to physical mapping changed. This can happen * if this range was removed and then reallocated to * map to new physical blocks during a fast commit. */ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); if (ret) goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified * inodes. In case these blocks are still used at either * a different logical range in the same inode or in * some different inode, we will mark them as allocated * at the end of the FC replay using our array of * modified inodes. */ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } /* Range is mapped and needs a state change */ ext4_debug("Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); if (ret) goto out; /* * We may have split the extent tree while toggling the state. * Try to shrink the extent tree now. */ ext4_ext_replay_shrink_inode(inode, start + len); next: cur += map.m_len; remaining -= map.m_len; } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); out: ext4_free_ext_path(path); iput(inode); return 0; } /* Replay DEL_RANGE tag */ static int ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; struct ext4_map_blocks map; ext4_lblk_t cur, remaining; int ret; memcpy(&lrange, val, sizeof(lrange)); cur = le32_to_cpu(lrange.fc_lblk); remaining = le32_to_cpu(lrange.fc_len); trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, le32_to_cpu(lrange.fc_ino), cur, remaining); inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret > 0) { remaining -= ret; cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; } } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_lblk) + le32_to_cpu(lrange.fc_len) - 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return 0; } static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) { struct ext4_fc_replay_state *state; struct inode *inode; struct ext4_ext_path *path = NULL; struct ext4_map_blocks map; int i, ret, j; ext4_lblk_t cur, end; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) { inode = ext4_iget(sb, state->fc_modified_inodes[i], EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found.", state->fc_modified_inodes[i]); continue; } cur = 0; end = EXT_MAX_BLOCKS; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { iput(inode); continue; } while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, path, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, true); } else { path = NULL; } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, true); } else { cur = cur + (map.m_len ? map.m_len : 1); } } iput(inode); } ext4_free_ext_path(path); } /* * Check if block is in excluded regions for block allocation. The simple * allocator that runs during replay phase is calls this function to see * if it is okay to use a block. */ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) { int i; struct ext4_fc_replay_state *state; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_regions_valid; i++) { if (state->fc_regions[i].ino == 0 || state->fc_regions[i].len == 0) continue; if (in_range(blk, state->fc_regions[i].pblk, state->fc_regions[i].len)) return true; } return false; } /* Cleanup function called after replay */ void ext4_fc_replay_cleanup(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_mount_state &= ~EXT4_FC_REPLAY; kfree(sbi->s_fc_replay_state.fc_regions); kfree(sbi->s_fc_replay_state.fc_modified_inodes); } static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, int tag, int len) { switch (tag) { case EXT4_FC_TAG_ADD_RANGE: return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: return len == sizeof(struct ext4_fc_del_range); case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: len -= sizeof(struct ext4_fc_dentry_info); return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE: len -= sizeof(struct ext4_fc_inode); return len >= EXT4_GOOD_OLD_INODE_SIZE && len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: return true; /* padding can have any length */ case EXT4_FC_TAG_TAIL: return len >= sizeof(struct ext4_fc_tail); case EXT4_FC_TAG_HEAD: return len == sizeof(struct ext4_fc_head); } return false; } /* * Recovery Scan phase handler * * This function is called during the scan phase and is responsible * for doing following things: * - Make sure the fast commit area has valid tags for replay * - Count number of tags that need to be replayed by the replay handler * - Verify CRC * - Create a list of excluded blocks for allocation during replay phase * * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP * to indicate that scan has finished and JBD2 can now start replay phase. * It returns a negative error to indicate that there was an error. At the end * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set * to indicate the number of tags that need to replayed during the replay phase. */ static int ext4_fc_replay_scan(journal_t *journal, struct buffer_head *bh, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail; __u8 *start, *end, *cur, *val; struct ext4_fc_head head; struct ext4_extent *ex; state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; state->fc_replay_num_tags = 0; state->fc_crc = 0; state->fc_regions = NULL; state->fc_regions_valid = state->fc_regions_used = state->fc_regions_size = 0; /* Check if we can stop early */ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) != EXT4_FC_TAG_HEAD) return 0; } if (off != state->fc_replay_expected_off) { ret = -EFSCORRUPTED; goto out_err; } state->fc_replay_expected_off++; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (tl.fc_len > end - val || !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err; } ext4_debug("Scan phase, tag:%s, blk %lld\n", tag2str(tl.fc_tag), bh->b_blocknr); switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; fallthrough; case EXT4_FC_TAG_DEL_RANGE: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && le32_to_cpu(tail.fc_crc) == state->fc_crc) { state->fc_replay_num_tags = state->fc_cur_tag; state->fc_regions_valid = state->fc_regions_used; } else { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -EFSBADCRC; } state->fc_crc = 0; break; case EXT4_FC_TAG_HEAD: memcpy(&head, val, sizeof(head)); if (le32_to_cpu(head.fc_features) & ~EXT4_FC_SUPPORTED_FEATURES) { ret = -EOPNOTSUPP; break; } if (le32_to_cpu(head.fc_tid) != expected_tid) { ret = JBD2_FC_REPLAY_STOP; break; } state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; } if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) break; } out_err: trace_ext4_fc_replay_scan(sb, ret, off); return ret; } /* * Main recovery path entry point. * The meaning of return codes is similar as above. */ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl_mem tl; __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; struct ext4_fc_tail tail; if (pass == PASS_SCAN) { state->fc_current_pass = PASS_SCAN; return ext4_fc_replay_scan(journal, bh, off, expected_tid); } if (state->fc_current_pass != pass) { state->fc_current_pass = pass; sbi->s_mount_state |= EXT4_FC_REPLAY; } if (!sbi->s_fc_replay_state.fc_replay_num_tags) { ext4_debug("Replay stops\n"); ext4_fc_set_bitmaps_and_counters(sb); return 0; } #ifdef CONFIG_EXT4_DEBUG if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { pr_warn("Dropping fc block %d because max_replay set\n", off); return JBD2_FC_REPLAY_STOP; } #endif start = (u8 *)bh->b_data; end = start + journal->j_blocksize; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; case EXT4_FC_TAG_UNLINK: ret = ext4_fc_replay_unlink(sb, &tl, val); break; case EXT4_FC_TAG_ADD_RANGE: ret = ext4_fc_replay_add_range(sb, &tl, val); break; case EXT4_FC_TAG_CREAT: ret = ext4_fc_replay_create(sb, &tl, val); break; case EXT4_FC_TAG_DEL_RANGE: ret = ext4_fc_replay_del_range(sb, &tl, val); break; case EXT4_FC_TAG_INODE: ret = ext4_fc_replay_inode(sb, &tl, val); break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; } return ret; } void ext4_fc_init(struct super_block *sb, journal_t *journal) { /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if * fast commit has now been turned off. */ journal->j_fc_replay_callback = ext4_fc_replay; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; } static const char * const fc_ineligible_reasons[] = { [EXT4_FC_REASON_XATTR] = "Extended attributes changed", [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", [EXT4_FC_REASON_NOMEM] = "Insufficient memory", [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", [EXT4_FC_REASON_RESIZE] = "Resize", [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", }; int ext4_fc_info_show(struct seq_file *seq, void *v) { struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); struct ext4_fc_stats *stats = &sbi->s_fc_stats; int i; if (v != SEQ_START_TOKEN) return 0; seq_printf(seq, "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", stats->fc_num_commits, stats->fc_ineligible_commits, stats->fc_numblks, div_u64(stats->s_fc_avg_commit_time, 1000)); seq_puts(seq, "Ineligible reasons:\n"); for (i = 0; i < EXT4_FC_REASON_MAX; i++) seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], stats->fc_ineligible_reason_count[i]); return 0; } int __init ext4_fc_init_dentry_cache(void) { ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, SLAB_RECLAIM_ACCOUNT); if (ext4_fc_dentry_cachep == NULL) return -ENOMEM; return 0; } void ext4_fc_destroy_dentry_cache(void) { kmem_cache_destroy(ext4_fc_dentry_cachep); }
4 4 4 4 4 2 2 12 12 9 10 10 2 2 4 4 4 4 8 1 6 1 2 2 4 1 3 1 2 2 2 2 4 4 4 3 3 4 4 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * The HSR spec says never to forward the same frame twice on the same * interface. A frame is identified by its source MAC address and its HSR * sequence number. This code keeps track of senders and their sequence numbers * to allow filtering of duplicate frames, and to detect HSR ring errors. * Same code handles filtering of duplicates for PRP as well. */ #include <linux/if_ether.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <linux/rculist.h> #include "hsr_main.h" #include "hsr_framereg.h" #include "hsr_netlink.h" /* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, * false otherwise. */ static bool seq_nr_after(u16 a, u16 b) { /* Remove inconsistency where * seq_nr_after(a, b) == seq_nr_before(a, b) */ if ((int)b - a == 32768) return false; return (((s16)(b - a)) < 0); } #define seq_nr_before(a, b) seq_nr_after((b), (a)) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr) { if (!hsr->redbox || !is_valid_ether_addr(hsr->macaddress_redbox)) return false; return ether_addr_equal(addr, hsr->macaddress_redbox); } bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { struct hsr_self_node *sn; bool ret = false; rcu_read_lock(); sn = rcu_dereference(hsr->self_node); if (!sn) { WARN_ONCE(1, "HSR: No self node\n"); goto out; } if (ether_addr_equal(addr, sn->macaddress_A) || ether_addr_equal(addr, sn->macaddress_B)) ret = true; out: rcu_read_unlock(); return ret; } /* Search for mac entry. Caller must hold rcu read lock. */ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, const unsigned char addr[ETH_ALEN]) { struct hsr_node *node; list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, addr)) return node; } return NULL; } /* Check if node for a given MAC address is already present in data base */ bool hsr_is_node_in_db(struct list_head *node_db, const unsigned char addr[ETH_ALEN]) { return !!find_node_by_addr_A(node_db, addr); } /* Helper for device init; the self_node is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ int hsr_create_self_node(struct hsr_priv *hsr, const unsigned char addr_a[ETH_ALEN], const unsigned char addr_b[ETH_ALEN]) { struct hsr_self_node *sn, *old; sn = kmalloc(sizeof(*sn), GFP_KERNEL); if (!sn) return -ENOMEM; ether_addr_copy(sn->macaddress_A, addr_a); ether_addr_copy(sn->macaddress_B, addr_b); spin_lock_bh(&hsr->list_lock); old = rcu_replace_pointer(hsr->self_node, sn, lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); if (old) kfree_rcu(old, rcu_head); return 0; } void hsr_del_self_node(struct hsr_priv *hsr) { struct hsr_self_node *old; spin_lock_bh(&hsr->list_lock); old = rcu_replace_pointer(hsr->self_node, NULL, lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); if (old) kfree_rcu(old, rcu_head); } void hsr_del_nodes(struct list_head *node_db) { struct hsr_node *node; struct hsr_node *tmp; list_for_each_entry_safe(node, tmp, node_db, mac_list) kfree(node); } void prp_handle_san_frame(bool san, enum hsr_port_type port, struct hsr_node *node) { /* Mark if the SAN node is over LAN_A or LAN_B */ if (port == HSR_PT_SLAVE_A) { node->san_a = true; return; } if (port == HSR_PT_SLAVE_B) node->san_b = true; } /* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. */ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, struct list_head *node_db, unsigned char addr[], u16 seq_out, bool san, enum hsr_port_type rx_port) { struct hsr_node *new_node, *node; unsigned long now; int i; new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); if (!new_node) return NULL; ether_addr_copy(new_node->macaddress_A, addr); spin_lock_init(&new_node->seq_out_lock); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; for (i = 0; i < HSR_PT_PORTS; i++) { new_node->time_in[i] = now; new_node->time_out[i] = now; } for (i = 0; i < HSR_PT_PORTS; i++) new_node->seq_out[i] = seq_out; if (san && hsr->proto_ops->handle_san_frame) hsr->proto_ops->handle_san_frame(san, rx_port, new_node); spin_lock_bh(&hsr->list_lock); list_for_each_entry_rcu(node, node_db, mac_list, lockdep_is_held(&hsr->list_lock)) { if (ether_addr_equal(node->macaddress_A, addr)) goto out; if (ether_addr_equal(node->macaddress_B, addr)) goto out; } list_add_tail_rcu(&new_node->mac_list, node_db); spin_unlock_bh(&hsr->list_lock); return new_node; out: spin_unlock_bh(&hsr->list_lock); kfree(new_node); return node; } void prp_update_san_info(struct hsr_node *node, bool is_sup) { if (!is_sup) return; node->san_a = false; node->san_b = false; } /* Get the hsr_node from which 'skb' was sent. */ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, struct sk_buff *skb, bool is_sup, enum hsr_port_type rx_port) { struct hsr_priv *hsr = port->hsr; struct hsr_node *node; struct ethhdr *ethhdr; struct prp_rct *rct; bool san = false; u16 seq_out; if (!skb_mac_header_was_set(skb)) return NULL; ethhdr = (struct ethhdr *)skb_mac_header(skb); list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } } /* Check if required node is not in proxy nodes table */ list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } } /* Everyone may create a node entry, connected node to a HSR/PRP * device. */ if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { /* Check if skb contains hsr_ethhdr */ if (skb->mac_len < sizeof(struct hsr_ethhdr)) return NULL; /* Use the existing sequence_nr from the tag as starting point * for filtering duplicate frames. */ seq_out = hsr_get_skb_sequence_nr(skb) - 1; } else { rct = skb_get_PRP_rct(skb); if (rct && prp_check_lsdu_size(skb, rct, is_sup)) { seq_out = prp_get_skb_sequence_nr(rct); } else { if (rx_port != HSR_PT_MASTER) san = true; seq_out = HSR_SEQNR_START; } } return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out, san, rx_port); } /* Use the Supervision frame's info about an eventual macaddress_B for merging * nodes that has previously had their macaddress_B registered as a separate * node. */ void hsr_handle_sup_frame(struct hsr_frame_info *frame) { struct hsr_node *node_curr = frame->node_src; struct hsr_port *port_rcv = frame->port_rcv; struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tlv *hsr_sup_tlv; struct hsr_node *node_real; struct sk_buff *skb = NULL; struct list_head *node_db; struct ethhdr *ethhdr; int i; unsigned int pull_size = 0; unsigned int total_pull_size = 0; /* Here either frame->skb_hsr or frame->skb_prp should be * valid as supervision frame always will have protocol * header info. */ if (frame->skb_hsr) skb = frame->skb_hsr; else if (frame->skb_prp) skb = frame->skb_prp; else if (frame->skb_std) skb = frame->skb_std; if (!skb) return; /* Leave the ethernet header. */ pull_size = sizeof(struct ethhdr); skb_pull(skb, pull_size); total_pull_size += pull_size; ethhdr = (struct ethhdr *)skb_mac_header(skb); /* And leave the HSR tag. */ if (ethhdr->h_proto == htons(ETH_P_HSR)) { pull_size = sizeof(struct hsr_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; } /* And leave the HSR sup tag. */ pull_size = sizeof(struct hsr_sup_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; /* get HSR sup payload */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Merge node_curr (registered on macaddress_B) into node_real */ node_db = &port_rcv->hsr->node_db; node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A); if (!node_real) /* No frame received from AddrA of this node yet */ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, HSR_SEQNR_START - 1, true, port_rcv->type); if (!node_real) goto done; /* No mem */ if (node_real == node_curr) /* Node has already been merged */ goto done; /* Leave the first HSR sup payload. */ pull_size = sizeof(struct hsr_sup_payload); skb_pull(skb, pull_size); total_pull_size += pull_size; /* Get second supervision tlv */ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; /* And check if it is a redbox mac TLV */ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) { /* We could stop here after pushing hsr_sup_payload, * or proceed and allow macaddress_B and for redboxes. */ /* Sanity check length */ if (hsr_sup_tlv->HSR_TLV_length != 6) goto done; /* Leave the second HSR sup tlv. */ pull_size = sizeof(struct hsr_sup_tlv); skb_pull(skb, pull_size); total_pull_size += pull_size; /* Get redbox mac address. */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Check if redbox mac and node mac are equal. */ if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) { /* This is a redbox supervision frame for a VDAN! */ goto done; } } ether_addr_copy(node_real->macaddress_B, ethhdr->h_source); spin_lock_bh(&node_real->seq_out_lock); for (i = 0; i < HSR_PT_PORTS; i++) { if (!node_curr->time_in_stale[i] && time_after(node_curr->time_in[i], node_real->time_in[i])) { node_real->time_in[i] = node_curr->time_in[i]; node_real->time_in_stale[i] = node_curr->time_in_stale[i]; } if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) node_real->seq_out[i] = node_curr->seq_out[i]; } spin_unlock_bh(&node_real->seq_out_lock); node_real->addr_B_port = port_rcv->type; spin_lock_bh(&hsr->list_lock); if (!node_curr->removed) { list_del_rcu(&node_curr->mac_list); node_curr->removed = true; kfree_rcu(node_curr, rcu_head); } spin_unlock_bh(&hsr->list_lock); done: /* Push back here */ skb_push(skb, total_pull_size); } /* 'skb' is a frame meant for this host, that is to be passed to upper layers. * * If the frame was sent by a node's B interface, replace the source * address with that node's "official" address (macaddress_A) so that upper * layers recognize where it came from. */ void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb) { if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); return; } memcpy(&eth_hdr(skb)->h_source, node->macaddress_A, ETH_ALEN); } /* 'skb' is a frame meant for another host. * 'port' is the outgoing interface * * Substitute the target (dest) MAC address if necessary, so the it matches the * recipient interface MAC address, regardless of whether that is the * recipient's A or B interface. * This is needed to keep the packets flowing through switches that learn on * which "side" the different interfaces are. */ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, struct hsr_port *port) { struct hsr_node *node_dst; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); return; } if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest)) return; node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); if (!node_dst && port->hsr->redbox) node_dst = find_node_by_addr_A(&port->hsr->proxy_node_db, eth_hdr(skb)->h_dest); if (!node_dst) { if (port->hsr->prot_version != PRP_V1 && net_ratelimit()) netdev_err(skb->dev, "%s: Unknown node\n", __func__); return; } if (port->type != node_dst->addr_B_port) return; if (is_valid_ether_addr(node_dst->macaddress_B)) ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->macaddress_B); } void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, u16 sequence_nr) { /* Don't register incoming frames without a valid sequence number. This * ensures entries of restarted nodes gets pruned so that they can * re-register and resume communications. */ if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && seq_nr_before(sequence_nr, node->seq_out[port->type])) return; node->time_in[port->type] = jiffies; node->time_in_stale[port->type] = false; } /* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid * ethhdr->h_source address and skb->mac_header set. * * Return: * 1 if frame can be shown to have been sent recently on this interface, * 0 otherwise, or * negative error code on error */ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, u16 sequence_nr) { spin_lock_bh(&node->seq_out_lock); if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) && time_is_after_jiffies(node->time_out[port->type] + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) { spin_unlock_bh(&node->seq_out_lock); return 1; } node->time_out[port->type] = jiffies; node->seq_out[port->type] = sequence_nr; spin_unlock_bh(&node->seq_out_lock); return 0; } static struct hsr_port *get_late_port(struct hsr_priv *hsr, struct hsr_node *node) { if (node->time_in_stale[HSR_PT_SLAVE_A]) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (node->time_in_stale[HSR_PT_SLAVE_B]) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (time_after(node->time_in[HSR_PT_SLAVE_B], node->time_in[HSR_PT_SLAVE_A] + msecs_to_jiffies(MAX_SLAVE_DIFF))) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (time_after(node->time_in[HSR_PT_SLAVE_A], node->time_in[HSR_PT_SLAVE_B] + msecs_to_jiffies(MAX_SLAVE_DIFF))) return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); return NULL; } /* Remove stale sequence_nr records. Called by timer every * HSR_LIFE_CHECK_INTERVAL (two seconds or so). */ void hsr_prune_nodes(struct timer_list *t) { struct hsr_priv *hsr = from_timer(hsr, t, prune_timer); struct hsr_node *node; struct hsr_node *tmp; struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; spin_lock_bh(&hsr->list_lock); list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) { /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A] * nor time_in[HSR_PT_SLAVE_B], will ever be updated for * the master port. Thus the master node will be repeatedly * pruned leading to packet loss. */ if (hsr_addr_is_self(hsr, node->macaddress_A)) continue; /* Shorthand */ time_a = node->time_in[HSR_PT_SLAVE_A]; time_b = node->time_in[HSR_PT_SLAVE_B]; /* Check for timestamps old enough to risk wrap-around */ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2)) node->time_in_stale[HSR_PT_SLAVE_A] = true; if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2)) node->time_in_stale[HSR_PT_SLAVE_B] = true; /* Get age of newest frame from node. * At least one time_in is OK here; nodes get pruned long * before both time_ins can get stale */ timestamp = time_a; if (node->time_in_stale[HSR_PT_SLAVE_A] || (!node->time_in_stale[HSR_PT_SLAVE_B] && time_after(time_b, time_a))) timestamp = time_b; /* Warn of ring error only as long as we get frames at all */ if (time_is_after_jiffies(timestamp + msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) { rcu_read_lock(); port = get_late_port(hsr, node); if (port) hsr_nl_ringerror(hsr, node->macaddress_A, port); rcu_read_unlock(); } /* Prune old entries */ if (time_is_before_jiffies(timestamp + msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { hsr_nl_nodedown(hsr, node->macaddress_A); if (!node->removed) { list_del_rcu(&node->mac_list); node->removed = true; /* Note that we need to free this entry later: */ kfree_rcu(node, rcu_head); } } } spin_unlock_bh(&hsr->list_lock); /* Restart timer */ mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); } void hsr_prune_proxy_nodes(struct timer_list *t) { struct hsr_priv *hsr = from_timer(hsr, t, prune_proxy_timer); unsigned long timestamp; struct hsr_node *node; struct hsr_node *tmp; spin_lock_bh(&hsr->list_lock); list_for_each_entry_safe(node, tmp, &hsr->proxy_node_db, mac_list) { /* Don't prune RedBox node. */ if (hsr_addr_is_redbox(hsr, node->macaddress_A)) continue; timestamp = node->time_in[HSR_PT_INTERLINK]; /* Prune old entries */ if (time_is_before_jiffies(timestamp + msecs_to_jiffies(HSR_PROXY_NODE_FORGET_TIME))) { hsr_nl_nodedown(hsr, node->macaddress_A); if (!node->removed) { list_del_rcu(&node->mac_list); node->removed = true; /* Note that we need to free this entry later: */ kfree_rcu(node, rcu_head); } } } spin_unlock_bh(&hsr->list_lock); /* Restart timer */ mod_timer(&hsr->prune_proxy_timer, jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD)); } void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]) { struct hsr_node *node; if (!_pos) { node = list_first_or_null_rcu(&hsr->node_db, struct hsr_node, mac_list); if (node) ether_addr_copy(addr, node->macaddress_A); return node; } node = _pos; list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) { ether_addr_copy(addr, node->macaddress_A); return node; } return NULL; } int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], unsigned int *addr_b_ifindex, int *if1_age, u16 *if1_seq, int *if2_age, u16 *if2_seq) { struct hsr_node *node; struct hsr_port *port; unsigned long tdiff; node = find_node_by_addr_A(&hsr->node_db, addr); if (!node) return -ENOENT; ether_addr_copy(addr_b, node->macaddress_B); tdiff = jiffies - node->time_in[HSR_PT_SLAVE_A]; if (node->time_in_stale[HSR_PT_SLAVE_A]) *if1_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) *if1_age = INT_MAX; #endif else *if1_age = jiffies_to_msecs(tdiff); tdiff = jiffies - node->time_in[HSR_PT_SLAVE_B]; if (node->time_in_stale[HSR_PT_SLAVE_B]) *if2_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) *if2_age = INT_MAX; #endif else *if2_age = jiffies_to_msecs(tdiff); /* Present sequence numbers as if they were incoming on interface */ *if1_seq = node->seq_out[HSR_PT_SLAVE_B]; *if2_seq = node->seq_out[HSR_PT_SLAVE_A]; if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); *addr_b_ifindex = port->dev->ifindex; } else { *addr_b_ifindex = -1; } return 0; }
3 5 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 /* SPDX-License-Identifier: GPL-2.0 */ /* Based on net/wireless/trace.h */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cfg802154 #if !defined(__RDEV_CFG802154_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ) #define __RDEV_CFG802154_OPS_TRACE #include <linux/tracepoint.h> #include <net/cfg802154.h> #define MAXNAME 32 #define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME) #define WPAN_PHY_ASSIGN strscpy(__entry->wpan_phy_name, \ wpan_phy_name(wpan_phy), \ MAXNAME) #define WPAN_PHY_PR_FMT "%s" #define WPAN_PHY_PR_ARG __entry->wpan_phy_name #define WPAN_DEV_ENTRY __field(u32, identifier) #define WPAN_DEV_ASSIGN (__entry->identifier) = (!IS_ERR_OR_NULL(wpan_dev) \ ? wpan_dev->identifier : 0) #define WPAN_DEV_PR_FMT "wpan_dev(%u)" #define WPAN_DEV_PR_ARG (__entry->identifier) #define WPAN_CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \ __field(enum nl802154_cca_opts, cca_opt) #define WPAN_CCA_ASSIGN \ do { \ (__entry->cca_mode) = cca->mode; \ (__entry->cca_opt) = cca->opt; \ } while (0) #define WPAN_CCA_PR_FMT "cca_mode: %d, cca_opt: %d" #define WPAN_CCA_PR_ARG __entry->cca_mode, __entry->cca_opt #define BOOL_TO_STR(bo) (bo) ? "true" : "false" /************************************************************* * rdev->ops traces * *************************************************************/ DECLARE_EVENT_CLASS(wpan_phy_only_evt, TP_PROTO(struct wpan_phy *wpan_phy), TP_ARGS(wpan_phy), TP_STRUCT__entry( WPAN_PHY_ENTRY ), TP_fast_assign( WPAN_PHY_ASSIGN; ), TP_printk(WPAN_PHY_PR_FMT, WPAN_PHY_PR_ARG) ); DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_suspend, TP_PROTO(struct wpan_phy *wpan_phy), TP_ARGS(wpan_phy) ); DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_resume, TP_PROTO(struct wpan_phy *wpan_phy), TP_ARGS(wpan_phy) ); TRACE_EVENT(802154_rdev_add_virtual_intf, TP_PROTO(struct wpan_phy *wpan_phy, char *name, enum nl802154_iftype type, __le64 extended_addr), TP_ARGS(wpan_phy, name, type, extended_addr), TP_STRUCT__entry( WPAN_PHY_ENTRY __string(vir_intf_name, name ? name : "<noname>") __field(enum nl802154_iftype, type) __field(__le64, extended_addr) ), TP_fast_assign( WPAN_PHY_ASSIGN; __assign_str(vir_intf_name); __entry->type = type; __entry->extended_addr = extended_addr; ), TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, extended addr: 0x%llx", WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type, __le64_to_cpu(__entry->extended_addr)) ); TRACE_EVENT(802154_rdev_del_virtual_intf, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev), TP_ARGS(wpan_phy, wpan_dev), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_DEV_ENTRY ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_DEV_ASSIGN; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT, WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG) ); TRACE_EVENT(802154_rdev_set_channel, TP_PROTO(struct wpan_phy *wpan_phy, u8 page, u8 channel), TP_ARGS(wpan_phy, page, channel), TP_STRUCT__entry( WPAN_PHY_ENTRY __field(u8, page) __field(u8, channel) ), TP_fast_assign( WPAN_PHY_ASSIGN; __entry->page = page; __entry->channel = channel; ), TP_printk(WPAN_PHY_PR_FMT ", page: %d, channel: %d", WPAN_PHY_PR_ARG, __entry->page, __entry->channel) ); TRACE_EVENT(802154_rdev_set_tx_power, TP_PROTO(struct wpan_phy *wpan_phy, s32 power), TP_ARGS(wpan_phy, power), TP_STRUCT__entry( WPAN_PHY_ENTRY __field(s32, power) ), TP_fast_assign( WPAN_PHY_ASSIGN; __entry->power = power; ), TP_printk(WPAN_PHY_PR_FMT ", mbm: %d", WPAN_PHY_PR_ARG, __entry->power) ); TRACE_EVENT(802154_rdev_set_cca_mode, TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca), TP_ARGS(wpan_phy, cca), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_CCA_ENTRY ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_CCA_ASSIGN; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_CCA_PR_FMT, WPAN_PHY_PR_ARG, WPAN_CCA_PR_ARG) ); TRACE_EVENT(802154_rdev_set_cca_ed_level, TP_PROTO(struct wpan_phy *wpan_phy, s32 ed_level), TP_ARGS(wpan_phy, ed_level), TP_STRUCT__entry( WPAN_PHY_ENTRY __field(s32, ed_level) ), TP_fast_assign( WPAN_PHY_ASSIGN; __entry->ed_level = ed_level; ), TP_printk(WPAN_PHY_PR_FMT ", ed level: %d", WPAN_PHY_PR_ARG, __entry->ed_level) ); DECLARE_EVENT_CLASS(802154_le16_template, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 le16arg), TP_ARGS(wpan_phy, wpan_dev, le16arg), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_DEV_ENTRY __field(__le16, le16arg) ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_DEV_ASSIGN; __entry->le16arg = le16arg; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", pan id: 0x%04x", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __le16_to_cpu(__entry->le16arg)) ); DEFINE_EVENT(802154_le16_template, 802154_rdev_set_pan_id, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 le16arg), TP_ARGS(wpan_phy, wpan_dev, le16arg) ); DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 le16arg), TP_ARGS(wpan_phy, wpan_dev, le16arg), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", short addr: 0x%04x", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __le16_to_cpu(__entry->le16arg)) ); TRACE_EVENT(802154_rdev_set_backoff_exponent, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be), TP_ARGS(wpan_phy, wpan_dev, min_be, max_be), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_DEV_ENTRY __field(u8, min_be) __field(u8, max_be) ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_DEV_ASSIGN; __entry->min_be = min_be; __entry->max_be = max_be; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", min be: %d, max be: %d", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be) ); TRACE_EVENT(802154_rdev_set_csma_backoffs, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 max_csma_backoffs), TP_ARGS(wpan_phy, wpan_dev, max_csma_backoffs), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_DEV_ENTRY __field(u8, max_csma_backoffs) ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_DEV_ASSIGN; __entry->max_csma_backoffs = max_csma_backoffs; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", max csma backoffs: %d", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->max_csma_backoffs) ); TRACE_EVENT(802154_rdev_set_max_frame_retries, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, s8 max_frame_retries), TP_ARGS(wpan_phy, wpan_dev, max_frame_retries), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_DEV_ENTRY __field(s8, max_frame_retries) ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_DEV_ASSIGN; __entry->max_frame_retries = max_frame_retries; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", max frame retries: %d", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->max_frame_retries) ); TRACE_EVENT(802154_rdev_set_lbt_mode, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool mode), TP_ARGS(wpan_phy, wpan_dev, mode), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_DEV_ENTRY __field(bool, mode) ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_DEV_ASSIGN; __entry->mode = mode; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", lbt mode: %s", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode)) ); TRACE_EVENT(802154_rdev_set_ackreq_default, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool ackreq), TP_ARGS(wpan_phy, wpan_dev, ackreq), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_DEV_ENTRY __field(bool, ackreq) ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_DEV_ASSIGN; __entry->ackreq = ackreq; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", ackreq default: %s", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->ackreq)) ); TRACE_EVENT(802154_rdev_trigger_scan, TP_PROTO(struct wpan_phy *wpan_phy, struct cfg802154_scan_request *request), TP_ARGS(wpan_phy, request), TP_STRUCT__entry( WPAN_PHY_ENTRY __field(u8, page) __field(u32, channels) __field(u8, duration) ), TP_fast_assign( WPAN_PHY_ASSIGN; __entry->page = request->page; __entry->channels = request->channels; __entry->duration = request->duration; ), TP_printk(WPAN_PHY_PR_FMT ", scan, page: %d, channels: %x, duration %d", WPAN_PHY_PR_ARG, __entry->page, __entry->channels, __entry->duration) ); TRACE_EVENT(802154_rdev_send_beacons, TP_PROTO(struct wpan_phy *wpan_phy, struct cfg802154_beacon_request *request), TP_ARGS(wpan_phy, request), TP_STRUCT__entry( WPAN_PHY_ENTRY __field(u8, interval) ), TP_fast_assign( WPAN_PHY_ASSIGN; __entry->interval = request->interval; ), TP_printk(WPAN_PHY_PR_FMT ", sending beacons (interval order: %d)", WPAN_PHY_PR_ARG, __entry->interval) ); DECLARE_EVENT_CLASS(802154_wdev_template, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev), TP_ARGS(wpan_phy, wpan_dev), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_DEV_ENTRY ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_DEV_ASSIGN; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT, WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG) ); DEFINE_EVENT(802154_wdev_template, 802154_rdev_abort_scan, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev), TP_ARGS(wpan_phy, wpan_dev) ); DEFINE_EVENT(802154_wdev_template, 802154_rdev_stop_beacons, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev), TP_ARGS(wpan_phy, wpan_dev) ); TRACE_EVENT(802154_rdev_associate, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_addr *coord), TP_ARGS(wpan_phy, wpan_dev, coord), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_DEV_ENTRY __field(__le64, addr) ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_DEV_ASSIGN; __entry->addr = coord->extended_addr; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", associating with: 0x%llx", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->addr) ); TRACE_EVENT(802154_rdev_disassociate, TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_addr *target), TP_ARGS(wpan_phy, wpan_dev, target), TP_STRUCT__entry( WPAN_PHY_ENTRY WPAN_DEV_ENTRY __field(__le64, addr) ), TP_fast_assign( WPAN_PHY_ASSIGN; WPAN_DEV_ASSIGN; __entry->addr = target->extended_addr; ), TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", disassociating with: 0x%llx", WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->addr) ); TRACE_EVENT(802154_rdev_return_int, TP_PROTO(struct wpan_phy *wpan_phy, int ret), TP_ARGS(wpan_phy, ret), TP_STRUCT__entry( WPAN_PHY_ENTRY __field(int, ret) ), TP_fast_assign( WPAN_PHY_ASSIGN; __entry->ret = ret; ), TP_printk(WPAN_PHY_PR_FMT ", returned: %d", WPAN_PHY_PR_ARG, __entry->ret) ); #endif /* !__RDEV_CFG802154_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
2017 59 1974 2012 2013 2020 232 1936 55 52 2031 1889 1899 30 30 30 30 30 30 13 30 30 30 81 30 1 29 170 169 171 64 79 39 6 77 1 2015 2031 2007 2020 1893 1894 8 1889 17 2007 1881 2016 2017 1482 1365 2030 13 2013 2017 2031 170 2166 2175 1860 419 424 1102 1094 1102 1096 1046 1280 2168 1866 1618 54 55 2 6 6 6 6 6 1 2 2 5 4 2 7 1 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * * In contrast to the low-resolution timeout API, aka timer wheel, * hrtimers provide finer resolution and accuracy depending on system * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. */ #include <linux/cpu.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <trace/events/timer.h> #include "tick-internal.h" /* * Masks for selecting the soft and hard context timers from * cpu_base->active */ #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) static void retrigger_next_event(void *arg); /* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, }, .csd = CSD_INIT(retrigger_next_event, NULL) }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { /* Make sure we catch unsupported clockids */ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) { if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return true; else return likely(base->online); } /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { .clock_base = { { .cpu_base = &migration_cpu_base, .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, &migration_cpu_base.lock), }, }, }; #define migration_base migration_cpu_base.clock_base[0] /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->lock) { struct hrtimer_clock_base *base; for (;;) { base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } } /* * Check if the elected target is suitable considering its next * event and the hotplug state of the current CPU. * * If the elected target is remote and its next event is after the timer * to queue, then a remote reprogram is necessary. However there is no * guarantee the IPI handling the operation would arrive in time to meet * the high resolution deadline. In this case the local CPU becomes a * preferred target, unless it is offline. * * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, struct hrtimer_cpu_base *new_cpu_base, struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; /* * The local CPU clockevent can be reprogrammed. Also get_target_base() * guarantees it is online. */ if (new_cpu_base == this_cpu_base) return true; /* * The offline local CPU can't be the default target if the * next remote target event is after this timer. Keep the * elected new base. An IPI will we issued to reprogram * it as a last resort. */ if (!hrtimer_base_is_online(this_cpu_base)) return true; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); return expires >= new_base->cpu_base->expires_next; } static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { if (!hrtimer_base_is_online(base)) { int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); return &per_cpu(hrtimer_bases, cpu); } #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); #endif return base; } /* * We switch the timer base to a power-optimized selected CPU target, * if: * - NO_HZ_COMMON is enabled * - timer migration is enabled * - the timer callback is not running * - the timer is not the first expiring timer on the new target * * If one of the above requirements is not fulfilled we move the timer * to the current CPU or leave it on the previously assigned CPU if * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; this_cpu_base = this_cpu_ptr(&hrtimer_bases); new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. The softirq * code will take care of this when the timer function has * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_hrtimer_base() */ WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; WRITE_ONCE(timer->base, base); goto again; } WRITE_ONCE(timer->base, new_base); } else { if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } } return new_base; } #else /* CONFIG_SMP */ static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } # define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ /* * Functions for the union type storage format of ktime_t which are * too large for inlining: */ #if BITS_PER_LONG < 64 /* * Divide a ktime value by a nanosecond value */ s64 __ktime_divns(const ktime_t kt, s64 div) { int sft = 0; s64 dclc; u64 tmp; dclc = ktime_to_ns(kt); tmp = dclc < 0 ? -dclc : dclc; /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } tmp >>= sft; do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* * Add two ktime values and do a safety check for overflow: */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can * return to user space in a timespec: */ if (res < 0 || res < lhs || res < rhs) res = ktime_set(KTIME_SEC_MAX, 0); return res; } EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { return ((struct hrtimer *) addr)->function; } /* * fixup_init is called when: * - an active object is initialized */ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); return true; default: return false; } } /* * fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); return true; default: return false; } } static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, }; static inline void debug_hrtimer_init(struct hrtimer *timer) { debug_object_init(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { debug_object_init_on_stack(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { debug_object_deactivate(timer, &hrtimer_debug_descr); } void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif static inline void debug_init(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_hrtimer_activate(timer, mode); trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) { debug_hrtimer_deactivate(timer); trace_hrtimer_cancel(timer); } static struct hrtimer_clock_base * __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { unsigned int idx; if (!*active) return NULL; idx = __ffs(*active); *active &= ~(1U << idx); return &cpu_base->clock_base[idx]; } #define for_each_active_base(base, cpu_base, active) \ while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ next = timerqueue_iterate_next(next); if (!next) continue; timer = container_of(next, struct hrtimer, node); } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; /* Skip cpu_base update if a timer is being excluded. */ if (exclude) continue; if (timer->is_soft) cpu_base->softirq_next_timer = timer; else cpu_base->next_timer = timer; } } /* * clock_was_set() might have changed base->offset of any of * the clock bases so the result might be negative. Fix it up * to prevent a false positive in clockevents_program_event(). */ if (expires_next < 0) expires_next = 0; return expires_next; } /* * Recomputes cpu_base::*next_timer and returns the earliest expires_next * but does not set cpu_base::*expires_next, that is done by * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating * cpu_base::*expires_next right away, reprogramming logic would no longer * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. * * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. * * @active_mask must be one of: * - HRTIMER_ACTIVE_ALL, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, expires_next); } return expires_next; } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) { ktime_t expires_next, soft = KTIME_MAX; /* * If the soft interrupt has already been activated, ignore the * soft bases. They will be handled in the already raised soft * interrupt. */ if (!cpu_base->softirq_activated) { soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * Update the soft expiry time. clock_settime() might have * affected it. */ cpu_base->softirq_expires_next = soft; } expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); /* * If a softirq timer is expiring first, update cpu_base->next_timer * and program the hardware with the soft expiry time. */ if (expires_next > soft) { cpu_base->next_timer = cpu_base->softirq_next_timer; expires_next = soft; } return expires_next; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; } /* * Is the high resolution mode active ? */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; /* * If hres is not active, hardware does not have to be * reprogrammed yet. * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following * scenario: * T1 expires 50ms from now * T2 expires 5s from now * * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being * set. So we'd effectively block all timers until the T2 event * fires. */ if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(expires_next, 1); } /* * Reprogram the event source with checking both queues for the * next event * Called with interrupts disabled and base->lock held */ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer enabled ? */ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); /* * hrtimer_high_res_enabled - query, if the highres mode is enabled */ static inline int hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } /* * Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); } #else static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level * resume code. * * This is only invoked when: * - CONFIG_HIGH_RES_TIMERS is enabled. * - CONFIG_NOHZ_COMMON is enabled * * For the other cases this function is empty and because the call sites * are optimized out it vanishes as well, i.e. no need for lots of * #ifdeffery. */ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); /* * When high resolution mode or nohz is active, then the offsets of * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the * next tick will take care of that. * * If high resolution mode is active then the next expiring timer * must be reevaluated and the clock event device reprogrammed if * necessary. * * In the NOHZ case the update of the offset and the reevaluation * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. */ if (!hrtimer_hres_active(base) && !tick_nohz_active) return; raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) hrtimer_force_reprogram(base, 0); else hrtimer_update_next_event(base); raw_spin_unlock(&base->lock); } /* * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * * Called with interrupts disabled and base->cpu_base.lock held */ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. */ if (expires < 0) expires = 0; if (timer->is_soft) { /* * soft hrtimer could be started on a remote CPU. In this * case softirq_expires_next needs to be updated on the * remote CPU. The soft hrtimer will not expire before the * first hard hrtimer on the remote CPU - * hrtimer_check_target() prevents this case. */ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; if (timer_cpu_base->softirq_activated) return; if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) return; timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. */ if (base->cpu_base != cpu_base) return; if (expires >= cpu_base->expires_next) return; /* * If the hrtimer interrupt is running, then it will reevaluate the * clock bases and reprogram the clock event device. */ if (cpu_base->in_hrtirq) return; cpu_base->next_timer = timer; __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; ktime_t expires; /* * Update the base offsets unconditionally so the following * checks whether the SMP function call is required works. * * The update is safe even when the remote CPU is in the hrtimer * interrupt or the hrtimer soft interrupt and expiring affected * bases. Either it will see the update before handling a base or * it will see it when it finishes the processing and reevaluates * the next expiring timer. */ seq = cpu_base->clock_was_set_seq; hrtimer_update_base(cpu_base); /* * If the sequence did not change over the update then the * remote CPU already handled it. */ if (seq == cpu_base->clock_was_set_seq) return false; /* * If the remote CPU is currently handling an hrtimer interrupt, it * will reevaluate the first expiring timer of all clock bases * before reprogramming. Nothing to do here. */ if (cpu_base->in_hrtirq) return false; /* * Walk the affected clock bases and check whether the first expiring * timer in a clock base is moving ahead of the first expiring timer of * @cpu_base. If so, the IPI must be invoked because per CPU clock * event devices cannot be remotely reprogrammed. */ active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; next = timerqueue_getnext(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; /* Extra check for softirq clock bases */ if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) continue; if (cpu_base->softirq_activated) continue; if (expires < cpu_base->softirq_expires_next) return true; } return false; } /* * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and * CLOCK_BOOTTIME (for late sleep time injection). * * This requires to update the offsets for these clocks * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this * also requires to eventually reprogram the per CPU clock event devices * when the change moves an affected timer ahead of the first expiring * timer on that CPU. Obviously remote per CPU clock event devices cannot * be reprogrammed. The other reason why an IPI has to be sent is when the * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets * in the tick, which obviously might be stopped, so this has to bring out * the remote CPU which might sleep in idle to get this sorted. */ void clock_was_set(unsigned int bases) { struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { on_each_cpu(retrigger_next_event, NULL, 1); goto out_timerfd; } /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { unsigned long flags; cpu_base = &per_cpu(hrtimer_bases, cpu); raw_spin_lock_irqsave(&cpu_base->lock, flags); if (update_needs_ipi(cpu_base, bases)) cpumask_set_cpu(cpu, mask); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } preempt_disable(); smp_call_function_many(mask, retrigger_next_event, NULL, 1); preempt_enable(); cpus_read_unlock(); free_cpumask_var(mask); out_timerfd: timerfd_clock_was_set(); } static void clock_was_set_work(struct work_struct *work) { clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* * Called from timekeeping code to reprogram the hrtimer interrupt device * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } /* * Called during resume either directly from via timekeeping_resume() * or in the case of s2idle from tick_unfreeze() to ensure that the * hrtimers are up to date. */ void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); } /* * Counterpart to lock_hrtimer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** * hrtimer_forward() - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * * .. note:: * This only updates the timer expiry value and does not requeue the timer. * * There is also a variant of the function hrtimer_forward_now(). * * Context: Can be safely called from the callback function of @timer. If called * from other contexts @timer must neither be enqueued nor running the * callback and the caller needs to take care of serialization. * * Return: The number of overruns are returned. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { u64 orun = 1; ktime_t delta; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) return 0; if (interval < hrtimer_resolution) interval = hrtimer_resolution; if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); if (hrtimer_get_expires_tv64(timer) > now) return orun; /* * This (and the ktime_add() below) is the * correction for exact: */ orun++; } hrtimer_add_expires(timer, interval); return orun; } EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * * Returns true when the new timer is the leftmost timer in the tree. */ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. So the worst thing what can happen is * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); } /* * remove hrtimer, called with base lock held */ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { bool reprogram; /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current * CPU. If we remove a timer on another CPU, reprogramming is * skipped. The interrupt event on this CPU is fired and * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * If the timer is not restarted then reprogramming is * required if the timer is local. If it is local and about * to be restarted, avoid programming it twice (on removal * and a moment later when it's requeued). */ if (!restart) state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { #ifdef CONFIG_TIME_LOW_RES /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) tim = ktime_add_safe(tim, hrtimer_resolution); #endif return tim; } static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { ktime_t expires; /* * Find the next SOFT expiration. */ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * reprogramming needs to be triggered, even if the next soft * hrtimer expires at the same time than the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() * cpu_base->*expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; bool force_local, first; /* * If the timer is on the local cpu base and is the first expiring * timer then this might end up reprogramming the hardware twice * (on removal and on enqueue). To avoid that by prevent the * reprogram on removal, keep the timer local to the current CPU * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; /* * Don't force local queuing if this enqueue happens on a unplugged * CPU after hrtimer_cpu_dying() has been invoked. */ force_local &= this_cpu_base->online; /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. * * If it's on the current CPU and the first expiring timer, then * skip reprogramming, keep the timer local and enforce * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). */ remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ if (!force_local) { new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); } else { new_base = base; } first = enqueue_hrtimer(timer, new_base, mode); if (!force_local) { /* * If the current CPU base is online, then the timer is * never queued on a remote CPU if it would be the first * expiring timer there. */ if (hrtimer_base_is_online(this_cpu_base)) return first; /* * Timer was enqueued remote because the current base is * already offline. If the timer is the first to expire, * kick the remote CPU to reprogram the clock event. */ if (first) { struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); } return 0; } /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. Force reprogram the * hardware by evaluating the new first expiring timer. */ hrtimer_force_reprogram(new_base->cpu_base, 1); return 0; } /** * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; if (WARN_ON_ONCE(!timer->function)) return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard * expiry mode because unmarked timers are moved to softirq expiry. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); else WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; /* * Check lockless first. If the timer is not active (neither * enqueued nor running the callback, nothing to do here. The * base lock does not serialize against a concurrent enqueue, * so we can avoid taking it. */ if (!hrtimer_active(timer)) return 0; base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); return ret; } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); #ifdef CONFIG_PREEMPT_RT static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { spin_lock_init(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) __acquires(&base->softirq_expiry_lock) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) __releases(&base->softirq_expiry_lock) { spin_unlock(&base->softirq_expiry_lock); } /* * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); spin_unlock(&cpu_base->softirq_expiry_lock); spin_lock(&cpu_base->softirq_expiry_lock); raw_spin_lock_irq(&cpu_base->lock); } } #ifdef CONFIG_SMP static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) { return base == &migration_base; } #else static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) { return false; } #endif /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion: if the soft irq thread is preempted * in the middle of a timer callback, then calling del_timer_sync() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer * handler to complete. This can result in unbound priority inversion. * * - If the caller originates from the task which preempted the timer * handler on the same CPU, then spin waiting for the timer handler to * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); /* * Just relax if the timer expires in hard interrupt context or if * it is currently on the migration base. */ if (!timer->is_soft || is_migration_base(base)) { cpu_relax(); return; } /* * Mark the base as contended and grab the expiry lock, which is * held by the softirq across the timer callback. Drop the lock * immediately so the softirq can expire the next timer. In theory * the timer could already be running again, but that's more than * unlikely and just causes another wait loop. */ atomic_inc(&base->cpu_base->timer_waiters); spin_lock_bh(&base->cpu_base->softirq_expiry_lock); atomic_dec(&base->cpu_base->timer_waiters); spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long flags) { } #endif /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. * @timer: the timer to be cancelled * * Returns: * 0 when the timer was not active * 1 when the timer was active */ int hrtimer_cancel(struct hrtimer *timer) { int ret; do { ret = hrtimer_try_to_cancel(timer); if (ret < 0) hrtimer_cancel_wait_running(timer); } while (ret < 0); return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) rem = hrtimer_expires_remaining_adjusted(timer); else rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } /** * hrtimer_next_event_without - time until next expiry event w/o one timer * @exclude: timer to exclude * * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; expires = __hrtimer_next_event_base(cpu_base, exclude, active, KTIME_MAX); } active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; expires = __hrtimer_next_event_base(cpu_base, exclude, active, expires); } raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) { if (likely(clock_id < MAX_CLOCKS)) { int base = hrtimer_clock_to_base_table[clock_id]; if (likely(base != HRTIMER_MAX_CLOCK_BASES)) return base; } WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); return HRTIMER_BASE_MONOTONIC; } static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) { return HRTIMER_NORESTART; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; int base; /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. spin_lock(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); /* * POSIX magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they needs to become CLOCK_MONOTONIC to * ensure POSIX compliance. */ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { __hrtimer_init(timer, clock_id, mode); if (WARN_ON_ONCE(!function)) timer->function = hrtimer_dummy_timeout; else timer->function = function; } /** * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_setup - initialize a timer to the given clock * @timer: the timer to be initialized * @function: the callback function * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); /** * hrtimer_setup_on_stack - initialize a timer on stack memory * @timer: The timer to be initialized * @function: the callback function * @clock_id: The clock to be used * @mode: The timer mode * * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack * memory. */ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { debug_init_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated * to another cpu. * * It is important for this function to not return a false negative. */ bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned int seq; do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } EXPORT_SYMBOL_GPL(hrtimer_active); /* * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 * distinct sections: * * - queued: the timer is queued * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * * On the read side we ensure we observe timer->state and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. * * The sequence numbers are required because otherwise we could still observe * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. */ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the * timer is restarted with a period then it becomes an absolute * timer. If its not restarted it does not matter. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES)) timer->is_rel = false; /* * The timer is marked as running in the CPU base, so it is * protected against migration to a different CPU even if the lock * is dropped. */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); WARN_ON_ONCE(base->running != timer); base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ if (basenow < hrtimer_get_softexpires_tv64(timer)) break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } } } static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); cpu_base->softirq_activated = 0; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via * the migration code. This does not affect enqueueing of * timers which run their callback and need to be requeued on * this CPU. */ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the [soft] next expiry */ expires_next = hrtimer_update_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. * * Acquire base lock for updating the offsets and retrieving * the current time. */ raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; /* * Give the system a chance to do something else than looping * here. We stored the entry time, so we know exactly how long * we spent here. We schedule the next event this amount of * time away. */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. */ if (delta > 100 * NSEC_PER_MSEC) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); else expires_next = ktime_add(now, delta); tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; if (hrtimer_hres_active(cpu_base)) return; /* * This _is_ ugly: We have to check periodically, whether we * can switch to highres and / or nohz mode. The clocksource * switch happens with xtime_lock held. Notification from * there only sets the check bit in the tick_oneshot code, * otherwise we might deadlock vs. xtime_lock. */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); return; } raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* * Sleep related functions: */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; if (task) wake_up_process(task); return HRTIMER_NORESTART; } /** * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer * @sl: sleeper to be started * @mode: timer mode abs/rel * * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because * __hrtimer_init_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; hrtimer_start_expires(&sl->timer, mode); } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context either for latency reasons or because the * hrtimer callback takes regular spinlocks or invokes other * functions which are not suitable for hard interrupt context on * PREEMPT_RT. * * The hrtimer_sleeper callback is RT compatible in hard interrupt * context, but there is a latency concern: Untrusted userspace can * spawn many threads which arm timers for the same expiry time on * the same CPU. That causes a latency spike due to the wakeup of * a gazillion threads. * * OTOH, privileged real-time user space applications rely on the * low latency of hard interrupt wakeups. If the current task is in * a real-time scheduling class, mark the mode for hard interrupt * expiry. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) mode |= HRTIMER_MODE_HARD; } __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; } /** * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_init_on_stack(&sl->timer, clock_id, mode); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: BUG(); } return -ERESTART_RESTARTBLOCK; } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { struct restart_block *restart; do { set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); __set_current_state(TASK_RUNNING); if (!t->task) return 0; restart = &current->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); struct timespec64 rmt; if (rem <= 0) return 0; rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; hrtimer_setup_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_MODE_ABS) { ret = -ERESTARTNOHAND; goto out; } restart = &current->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); return ret; } #ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; if (get_timespec64(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif /* * Functions related to boot-time initialization: */ int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } int hrtimers_cpu_starting(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; return 0; } #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct timerqueue_node *node; while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not * reprogram the event device in case the timer * expires before the earliest on this CPU, but we run * hrtimer_interrupt after we migrated everything to * sort out already expired timers and reprogram the * event device. */ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); struct hrtimer_cpu_base *old_base, *new_base; old_base = this_cpu_ptr(&hrtimer_bases); new_base = &per_cpu(hrtimer_bases, ncpu); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } /* * The migration might have changed the first expiring softirq * timer on this CPU. Update it. */ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; } #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); hrtimers_cpu_starting(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); }
3 3 3 4 4 2 3 3 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> static struct ax25_protocol *protocol_list; static DEFINE_RWLOCK(protocol_list_lock); static HLIST_HEAD(ax25_linkfail_list); static DEFINE_SPINLOCK(linkfail_lock); static struct listen_struct { struct listen_struct *next; ax25_address callsign; struct net_device *dev; } *listen_list = NULL; static DEFINE_SPINLOCK(listen_lock); /* * Do not register the internal protocols AX25_P_TEXT, AX25_P_SEGMENT, * AX25_P_IP or AX25_P_ARP ... */ void ax25_register_pid(struct ax25_protocol *ap) { write_lock_bh(&protocol_list_lock); ap->next = protocol_list; protocol_list = ap; write_unlock_bh(&protocol_list_lock); } EXPORT_SYMBOL_GPL(ax25_register_pid); void ax25_protocol_release(unsigned int pid) { struct ax25_protocol *protocol; write_lock_bh(&protocol_list_lock); protocol = protocol_list; if (protocol == NULL) goto out; if (protocol->pid == pid) { protocol_list = protocol->next; goto out; } while (protocol != NULL && protocol->next != NULL) { if (protocol->next->pid == pid) { protocol->next = protocol->next->next; goto out; } protocol = protocol->next; } out: write_unlock_bh(&protocol_list_lock); } EXPORT_SYMBOL(ax25_protocol_release); void ax25_linkfail_register(struct ax25_linkfail *lf) { spin_lock_bh(&linkfail_lock); hlist_add_head(&lf->lf_node, &ax25_linkfail_list); spin_unlock_bh(&linkfail_lock); } EXPORT_SYMBOL(ax25_linkfail_register); void ax25_linkfail_release(struct ax25_linkfail *lf) { spin_lock_bh(&linkfail_lock); hlist_del_init(&lf->lf_node); spin_unlock_bh(&linkfail_lock); } EXPORT_SYMBOL(ax25_linkfail_release); int ax25_listen_register(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; if (ax25_listen_mine(callsign, dev)) return 0; if ((listen = kmalloc(sizeof(*listen), GFP_ATOMIC)) == NULL) return -ENOMEM; listen->callsign = *callsign; listen->dev = dev; spin_lock_bh(&listen_lock); listen->next = listen_list; listen_list = listen; spin_unlock_bh(&listen_lock); return 0; } EXPORT_SYMBOL(ax25_listen_register); void ax25_listen_release(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *s, *listen; spin_lock_bh(&listen_lock); listen = listen_list; if (listen == NULL) { spin_unlock_bh(&listen_lock); return; } if (ax25cmp(&listen->callsign, callsign) == 0 && listen->dev == dev) { listen_list = listen->next; spin_unlock_bh(&listen_lock); kfree(listen); return; } while (listen != NULL && listen->next != NULL) { if (ax25cmp(&listen->next->callsign, callsign) == 0 && listen->next->dev == dev) { s = listen->next; listen->next = listen->next->next; spin_unlock_bh(&listen_lock); kfree(s); return; } listen = listen->next; } spin_unlock_bh(&listen_lock); } EXPORT_SYMBOL(ax25_listen_release); int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *) { int (*res)(struct sk_buff *, ax25_cb *) = NULL; struct ax25_protocol *protocol; read_lock(&protocol_list_lock); for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) if (protocol->pid == pid) { res = protocol->func; break; } read_unlock(&protocol_list_lock); return res; } int ax25_listen_mine(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; spin_lock_bh(&listen_lock); for (listen = listen_list; listen != NULL; listen = listen->next) if (ax25cmp(&listen->callsign, callsign) == 0 && (listen->dev == dev || listen->dev == NULL)) { spin_unlock_bh(&listen_lock); return 1; } spin_unlock_bh(&listen_lock); return 0; } void ax25_link_failed(ax25_cb *ax25, int reason) { struct ax25_linkfail *lf; spin_lock_bh(&linkfail_lock); hlist_for_each_entry(lf, &ax25_linkfail_list, lf_node) lf->func(ax25, reason); spin_unlock_bh(&linkfail_lock); } int ax25_protocol_is_registered(unsigned int pid) { struct ax25_protocol *protocol; int res = 0; read_lock_bh(&protocol_list_lock); for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) if (protocol->pid == pid) { res = 1; break; } read_unlock_bh(&protocol_list_lock); return res; }
14 13 1 12 1 9 4 5 1 3 1 2 2 14 1 1 2 10 2 3 2 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_mqprio.c * * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com> */ #include <linux/ethtool_netlink.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/module.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include "sch_mqprio_lib.h" struct mqprio_sched { struct Qdisc **qdiscs; u16 mode; u16 shaper; int hw_offload; u32 flags; u64 min_rate[TC_QOPT_MAX_QUEUE]; u64 max_rate[TC_QOPT_MAX_QUEUE]; u32 fp[TC_QOPT_MAX_QUEUE]; }; static int mqprio_enable_offload(struct Qdisc *sch, const struct tc_mqprio_qopt *qopt, struct netlink_ext_ack *extack) { struct mqprio_sched *priv = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt_offload mqprio = { .qopt = *qopt, .extack = extack, }; int err, i; switch (priv->mode) { case TC_MQPRIO_MODE_DCB: if (priv->shaper != TC_MQPRIO_SHAPER_DCB) return -EINVAL; break; case TC_MQPRIO_MODE_CHANNEL: mqprio.flags = priv->flags; if (priv->flags & TC_MQPRIO_F_MODE) mqprio.mode = priv->mode; if (priv->flags & TC_MQPRIO_F_SHAPER) mqprio.shaper = priv->shaper; if (priv->flags & TC_MQPRIO_F_MIN_RATE) for (i = 0; i < mqprio.qopt.num_tc; i++) mqprio.min_rate[i] = priv->min_rate[i]; if (priv->flags & TC_MQPRIO_F_MAX_RATE) for (i = 0; i < mqprio.qopt.num_tc; i++) mqprio.max_rate[i] = priv->max_rate[i]; break; default: return -EINVAL; } mqprio_fp_to_offload(priv->fp, &mqprio); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO, &mqprio); if (err) return err; priv->hw_offload = mqprio.qopt.hw; return 0; } static void mqprio_disable_offload(struct Qdisc *sch) { struct tc_mqprio_qopt_offload mqprio = { { 0 } }; struct mqprio_sched *priv = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); switch (priv->mode) { case TC_MQPRIO_MODE_DCB: case TC_MQPRIO_MODE_CHANNEL: dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO, &mqprio); break; } } static void mqprio_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); unsigned int ntx; if (priv->qdiscs) { for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) qdisc_put(priv->qdiscs[ntx]); kfree(priv->qdiscs); } if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) mqprio_disable_offload(sch); else netdev_set_num_tc(dev, 0); } static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, const struct tc_mqprio_caps *caps, struct netlink_ext_ack *extack) { int err; /* Limit qopt->hw to maximum supported offload value. Drivers have * the option of overriding this later if they don't support the a * given offload type. */ if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX) qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX; /* If hardware offload is requested, we will leave 3 options to the * device driver: * - populate the queue counts itself (and ignore what was requested) * - validate the provided queue counts by itself (and apply them) * - request queue count validation here (and apply them) */ err = mqprio_validate_qopt(dev, qopt, !qopt->hw || caps->validate_queue_counts, false, extack); if (err) return err; /* If ndo_setup_tc is not present then hardware doesn't support offload * and we should return an error. */ if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Device does not support hardware offload"); return -EINVAL; } return 0; } static const struct nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = { [TCA_MQPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, TC_QOPT_MAX_QUEUE), [TCA_MQPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), }; static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = { [TCA_MQPRIO_MODE] = { .len = sizeof(u16) }, [TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) }, [TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED }, [TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED }, [TCA_MQPRIO_TC_ENTRY] = { .type = NLA_NESTED }, }; static int mqprio_parse_tc_entry(u32 fp[TC_QOPT_MAX_QUEUE], struct nlattr *opt, unsigned long *seen_tcs, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_MQPRIO_TC_ENTRY_MAX + 1]; int err, tc; err = nla_parse_nested(tb, TCA_MQPRIO_TC_ENTRY_MAX, opt, mqprio_tc_entry_policy, extack); if (err < 0) return err; if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_MQPRIO_TC_ENTRY_INDEX)) { NL_SET_ERR_MSG(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_INDEX]); if (*seen_tcs & BIT(tc)) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_TC_ENTRY_INDEX], "Duplicate tc entry"); return -EINVAL; } *seen_tcs |= BIT(tc); if (tb[TCA_MQPRIO_TC_ENTRY_FP]) fp[tc] = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_FP]); return 0; } static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt, int nlattr_opt_len, struct netlink_ext_ack *extack) { struct mqprio_sched *priv = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); bool have_preemption = false; unsigned long seen_tcs = 0; u32 fp[TC_QOPT_MAX_QUEUE]; struct nlattr *n; int tc, rem; int err = 0; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) fp[tc] = priv->fp[tc]; nla_for_each_attr_type(n, TCA_MQPRIO_TC_ENTRY, nlattr_opt, nlattr_opt_len, rem) { err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack); if (err) goto out; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { priv->fp[tc] = fp[tc]; if (fp[tc] == TC_FP_PREEMPTIBLE) have_preemption = true; } if (have_preemption && !ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG(extack, "Device does not support preemption"); return -EOPNOTSUPP; } out: return err; } /* Parse the other netlink attributes that represent the payload of * TCA_OPTIONS, which are appended right after struct tc_mqprio_qopt. */ static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt, struct nlattr *opt, struct netlink_ext_ack *extack) { struct nlattr *nlattr_opt = nla_data(opt) + NLA_ALIGN(sizeof(*qopt)); int nlattr_opt_len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); struct mqprio_sched *priv = qdisc_priv(sch); struct nlattr *tb[TCA_MQPRIO_MAX + 1] = {}; struct nlattr *attr; int i, rem, err; if (nlattr_opt_len >= nla_attr_size(0)) { err = nla_parse_deprecated(tb, TCA_MQPRIO_MAX, nlattr_opt, nlattr_opt_len, mqprio_policy, NULL); if (err < 0) return err; } if (!qopt->hw) { NL_SET_ERR_MSG(extack, "mqprio TCA_OPTIONS can only contain netlink attributes in hardware mode"); return -EINVAL; } if (tb[TCA_MQPRIO_MODE]) { priv->flags |= TC_MQPRIO_F_MODE; priv->mode = nla_get_u16(tb[TCA_MQPRIO_MODE]); } if (tb[TCA_MQPRIO_SHAPER]) { priv->flags |= TC_MQPRIO_F_SHAPER; priv->shaper = nla_get_u16(tb[TCA_MQPRIO_SHAPER]); } if (tb[TCA_MQPRIO_MIN_RATE64]) { if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_MIN_RATE64], "min_rate accepted only when shaper is in bw_rlimit mode"); return -EINVAL; } i = 0; nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], rem) { if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) { NL_SET_ERR_MSG_ATTR(extack, attr, "Attribute type expected to be TCA_MQPRIO_MIN_RATE64"); return -EINVAL; } if (nla_len(attr) != sizeof(u64)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Attribute TCA_MQPRIO_MIN_RATE64 expected to have 8 bytes length"); return -EINVAL; } if (i >= qopt->num_tc) break; priv->min_rate[i] = nla_get_u64(attr); i++; } priv->flags |= TC_MQPRIO_F_MIN_RATE; } if (tb[TCA_MQPRIO_MAX_RATE64]) { if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_MAX_RATE64], "max_rate accepted only when shaper is in bw_rlimit mode"); return -EINVAL; } i = 0; nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], rem) { if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) { NL_SET_ERR_MSG_ATTR(extack, attr, "Attribute type expected to be TCA_MQPRIO_MAX_RATE64"); return -EINVAL; } if (nla_len(attr) != sizeof(u64)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Attribute TCA_MQPRIO_MAX_RATE64 expected to have 8 bytes length"); return -EINVAL; } if (i >= qopt->num_tc) break; priv->max_rate[i] = nla_get_u64(attr); i++; } priv->flags |= TC_MQPRIO_F_MAX_RATE; } if (tb[TCA_MQPRIO_TC_ENTRY]) { err = mqprio_parse_tc_entries(sch, nlattr_opt, nlattr_opt_len, extack); if (err) return err; } return 0; } static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); struct netdev_queue *dev_queue; struct Qdisc *qdisc; int i, err = -EOPNOTSUPP; struct tc_mqprio_qopt *qopt = NULL; struct tc_mqprio_caps caps; int len, tc; BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); if (sch->parent != TC_H_ROOT) return -EOPNOTSUPP; if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; /* make certain can allocate enough classids to handle queues */ if (dev->num_tx_queues >= TC_H_MIN_PRIORITY) return -ENOMEM; if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) priv->fp[tc] = TC_FP_EXPRESS; qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO, &caps, sizeof(caps)); qopt = nla_data(opt); if (mqprio_parse_opt(dev, qopt, &caps, extack)) return -EINVAL; len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); if (len > 0) { err = mqprio_parse_nlattr(sch, qopt, opt, extack); if (err) return err; } /* pre-allocate qdisc, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); if (!priv->qdiscs) return -ENOMEM; for (i = 0; i < dev->num_tx_queues; i++) { dev_queue = netdev_get_tx_queue(dev, i); qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, i), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1)), extack); if (!qdisc) return -ENOMEM; priv->qdiscs[i] = qdisc; qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } /* If the mqprio options indicate that hardware should own * the queue mapping then run ndo_setup_tc otherwise use the * supplied and verified mapping */ if (qopt->hw) { err = mqprio_enable_offload(sch, qopt, extack); if (err) return err; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) netdev_set_tc_queue(dev, i, qopt->count[i], qopt->offset[i]); } /* Always use supplied priority mappings */ for (i = 0; i < TC_BITMASK + 1; i++) netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]); sch->flags |= TCQ_F_MQROOT; return 0; } static void mqprio_attach(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); struct Qdisc *qdisc, *old; unsigned int ntx; /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = priv->qdiscs[ntx]; old = dev_graft_qdisc(qdisc->dev_queue, qdisc); if (old) qdisc_put(old); if (ntx < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); } kfree(priv->qdiscs); priv->qdiscs = NULL; } static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); if (!dev_queue) return -EINVAL; if (dev->flags & IFF_UP) dev_deactivate(dev); *old = dev_graft_qdisc(dev_queue, new); if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; } static int dump_rates(struct mqprio_sched *priv, struct tc_mqprio_qopt *opt, struct sk_buff *skb) { struct nlattr *nest; int i; if (priv->flags & TC_MQPRIO_F_MIN_RATE) { nest = nla_nest_start_noflag(skb, TCA_MQPRIO_MIN_RATE64); if (!nest) goto nla_put_failure; for (i = 0; i < opt->num_tc; i++) { if (nla_put(skb, TCA_MQPRIO_MIN_RATE64, sizeof(priv->min_rate[i]), &priv->min_rate[i])) goto nla_put_failure; } nla_nest_end(skb, nest); } if (priv->flags & TC_MQPRIO_F_MAX_RATE) { nest = nla_nest_start_noflag(skb, TCA_MQPRIO_MAX_RATE64); if (!nest) goto nla_put_failure; for (i = 0; i < opt->num_tc; i++) { if (nla_put(skb, TCA_MQPRIO_MAX_RATE64, sizeof(priv->max_rate[i]), &priv->max_rate[i])) goto nla_put_failure; } nla_nest_end(skb, nest); } return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static int mqprio_dump_tc_entries(struct mqprio_sched *priv, struct sk_buff *skb) { struct nlattr *n; int tc; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { n = nla_nest_start(skb, TCA_MQPRIO_TC_ENTRY); if (!n) return -EMSGSIZE; if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_INDEX, tc)) goto nla_put_failure; if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_FP, priv->fp[tc])) goto nla_put_failure; nla_nest_end(skb, n); } return 0; nla_put_failure: nla_nest_cancel(skb, n); return -EMSGSIZE; } static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; unsigned int ntx; sch->q.qlen = 0; gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. However, statistics accounting needs * to account for all, none, or a mix of locked and unlocked child * qdiscs. Percpu stats are added to counters in-band and locking * qdisc totals are added at end. */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); spin_lock_bh(qdisc_lock(qdisc)); gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, &qdisc->bstats, false); gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, &qdisc->qstats); sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } mqprio_qopt_reconstruct(dev, &opt); opt.hw = priv->hw_offload; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; if ((priv->flags & TC_MQPRIO_F_MODE) && nla_put_u16(skb, TCA_MQPRIO_MODE, priv->mode)) goto nla_put_failure; if ((priv->flags & TC_MQPRIO_F_SHAPER) && nla_put_u16(skb, TCA_MQPRIO_SHAPER, priv->shaper)) goto nla_put_failure; if ((priv->flags & TC_MQPRIO_F_MIN_RATE || priv->flags & TC_MQPRIO_F_MAX_RATE) && (dump_rates(priv, &opt, skb) != 0)) goto nla_put_failure; if (mqprio_dump_tc_entries(priv, skb)) goto nla_put_failure; return nla_nest_end(skb, nla); nla_put_failure: nlmsg_trim(skb, nla); return -1; } static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl) { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); if (!dev_queue) return NULL; return rtnl_dereference(dev_queue->qdisc_sleeping); } static unsigned long mqprio_find(struct Qdisc *sch, u32 classid) { struct net_device *dev = qdisc_dev(sch); unsigned int ntx = TC_H_MIN(classid); /* There are essentially two regions here that have valid classid * values. The first region will have a classid value of 1 through * num_tx_queues. All of these are backed by actual Qdiscs. */ if (ntx < TC_H_MIN_PRIORITY) return (ntx <= dev->num_tx_queues) ? ntx : 0; /* The second region represents the hardware traffic classes. These * are represented by classid values of TC_H_MIN_PRIORITY through * TC_H_MIN_PRIORITY + netdev_get_num_tc - 1 */ return ((ntx - TC_H_MIN_PRIORITY) < netdev_get_num_tc(dev)) ? ntx : 0; } static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { if (cl < TC_H_MIN_PRIORITY) { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); struct net_device *dev = qdisc_dev(sch); int tc = netdev_txq_to_tc(dev, cl - 1); tcm->tcm_parent = (tc < 0) ? 0 : TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(tc + TC_H_MIN_PRIORITY)); tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle; } else { tcm->tcm_parent = TC_H_ROOT; tcm->tcm_info = 0; } tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) __releases(d->lock) __acquires(d->lock) { if (cl >= TC_H_MIN_PRIORITY) { int i; __u32 qlen; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_sync bstats; struct net_device *dev = qdisc_dev(sch); struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; gnet_stats_basic_sync_init(&bstats); /* Drop lock here it will be reclaimed before touching * statistics this is required because the d->lock we * hold here is the look on dev_queue->qdisc_sleeping * also acquired below. */ if (d->lock) spin_unlock_bh(d->lock); for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); struct Qdisc *qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); gnet_stats_add_basic(&bstats, qdisc->cpu_bstats, &qdisc->bstats, false); gnet_stats_add_queue(&qstats, qdisc->cpu_qstats, &qdisc->qstats); sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } qlen = qdisc_qlen(sch) + qstats.qlen; /* Reclaim root sleeping lock before completing stats */ if (d->lock) spin_lock_bh(d->lock); if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0 || gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = rtnl_dereference(dev_queue->qdisc_sleeping); if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; } return 0; } static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx; if (arg->stop) return; /* Walk hierarchy with a virtual class per tc */ arg->count = arg->skip; for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + TC_H_MIN_PRIORITY, arg)) return; } /* Pad the values and skip over unused traffic classes */ if (ntx < TC_MAX_QUEUE) { arg->count = TC_MAX_QUEUE; ntx = TC_MAX_QUEUE; } /* Reset offset, sort out remaining per-queue qdiscs */ for (ntx -= TC_MAX_QUEUE; ntx < dev->num_tx_queues; ntx++) { if (arg->fn(sch, ntx + 1, arg) < 0) { arg->stop = 1; return; } arg->count++; } } static struct netdev_queue *mqprio_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return mqprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static const struct Qdisc_class_ops mqprio_class_ops = { .graft = mqprio_graft, .leaf = mqprio_leaf, .find = mqprio_find, .walk = mqprio_walk, .dump = mqprio_dump_class, .dump_stats = mqprio_dump_class_stats, .select_queue = mqprio_select_queue, }; static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { .cl_ops = &mqprio_class_ops, .id = "mqprio", .priv_size = sizeof(struct mqprio_sched), .init = mqprio_init, .destroy = mqprio_destroy, .attach = mqprio_attach, .change_real_num_tx = mq_change_real_num_tx, .dump = mqprio_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("mqprio"); static int __init mqprio_module_init(void) { return register_qdisc(&mqprio_qdisc_ops); } static void __exit mqprio_module_exit(void) { unregister_qdisc(&mqprio_qdisc_ops); } module_init(mqprio_module_init); module_exit(mqprio_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Classful multiqueue prio qdisc");
7878 7868 1 3 68 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <net/net_namespace.h> #include <linux/if_arp.h> #include <net/rtnetlink.h> static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev) { dev_lstats_add(dev, skb->len); dev_kfree_skb(skb); return NETDEV_TX_OK; } struct nlmon { struct netlink_tap nt; }; static int nlmon_open(struct net_device *dev) { struct nlmon *nlmon = netdev_priv(dev); nlmon->nt.dev = dev; nlmon->nt.module = THIS_MODULE; return netlink_add_tap(&nlmon->nt); } static int nlmon_close(struct net_device *dev) { struct nlmon *nlmon = netdev_priv(dev); return netlink_remove_tap(&nlmon->nt); } static void nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { dev_lstats_read(dev, &stats->rx_packets, &stats->rx_bytes); } static u32 always_on(struct net_device *dev) { return 1; } static const struct ethtool_ops nlmon_ethtool_ops = { .get_link = always_on, }; static const struct net_device_ops nlmon_ops = { .ndo_open = nlmon_open, .ndo_stop = nlmon_close, .ndo_start_xmit = nlmon_xmit, .ndo_get_stats64 = nlmon_get_stats64, }; static void nlmon_setup(struct net_device *dev) { dev->type = ARPHRD_NETLINK; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->netdev_ops = &nlmon_ops; dev->ethtool_ops = &nlmon_ethtool_ops; dev->needs_free_netdev = true; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->flags = IFF_NOARP; dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; /* That's rather a softlimit here, which, of course, * can be altered. Not a real MTU, but what is to be * expected in most cases. */ dev->mtu = NLMSG_GOODSIZE; dev->min_mtu = sizeof(struct nlmsghdr); } static int nlmon_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) return -EINVAL; return 0; } static struct rtnl_link_ops nlmon_link_ops __read_mostly = { .kind = "nlmon", .priv_size = sizeof(struct nlmon), .setup = nlmon_setup, .validate = nlmon_validate, }; static __init int nlmon_register(void) { return rtnl_link_register(&nlmon_link_ops); } static __exit void nlmon_unregister(void) { rtnl_link_unregister(&nlmon_link_ops); } module_init(nlmon_register); module_exit(nlmon_unregister); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_AUTHOR("Mathieu Geli <geli@enseirb.fr>"); MODULE_DESCRIPTION("Netlink monitoring device"); MODULE_ALIAS_RTNL_LINK("nlmon");
1 1 1 1 2 1 2 2 2 2 3 1 2 1 1 2 2 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool_netlink.h> #include <net/udp_tunnel.h> #include <net/vxlan.h> #include "bitset.h" #include "common.h" #include "netlink.h" const struct nla_policy ethnl_tunnel_info_get_policy[] = { [ETHTOOL_A_TUNNEL_INFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN == ilog2(UDP_TUNNEL_TYPE_VXLAN)); static_assert(ETHTOOL_UDP_TUNNEL_TYPE_GENEVE == ilog2(UDP_TUNNEL_TYPE_GENEVE)); static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE == ilog2(UDP_TUNNEL_TYPE_VXLAN_GPE)); static ssize_t ethnl_udp_table_reply_size(unsigned int types, bool compact) { ssize_t size; size = ethnl_bitset32_size(&types, NULL, __ETHTOOL_UDP_TUNNEL_TYPE_CNT, udp_tunnel_type_names, compact); if (size < 0) return size; return size + nla_total_size(0) + /* _UDP_TABLE */ nla_total_size(sizeof(u32)); /* _UDP_TABLE_SIZE */ } static ssize_t ethnl_tunnel_info_reply_size(const struct ethnl_req_info *req_base, struct netlink_ext_ack *extack) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct udp_tunnel_nic_info *info; unsigned int i; ssize_t ret; size_t size; info = req_base->dev->udp_tunnel_nic_info; if (!info) { NL_SET_ERR_MSG(extack, "device does not report tunnel offload info"); return -EOPNOTSUPP; } size = nla_total_size(0); /* _INFO_UDP_PORTS */ for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) break; ret = ethnl_udp_table_reply_size(info->tables[i].tunnel_types, compact); if (ret < 0) return ret; size += ret; size += udp_tunnel_nic_dump_size(req_base->dev, i); } if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) { ret = ethnl_udp_table_reply_size(0, compact); if (ret < 0) return ret; size += ret; size += nla_total_size(0) + /* _TABLE_ENTRY */ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ } return size; } static int ethnl_tunnel_info_fill_reply(const struct ethnl_req_info *req_base, struct sk_buff *skb) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct udp_tunnel_nic_info *info; struct nlattr *ports, *table, *entry; unsigned int i; info = req_base->dev->udp_tunnel_nic_info; if (!info) return -EOPNOTSUPP; ports = nla_nest_start(skb, ETHTOOL_A_TUNNEL_INFO_UDP_PORTS); if (!ports) return -EMSGSIZE; for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) break; table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE); if (!table) goto err_cancel_ports; if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, info->tables[i].n_entries)) goto err_cancel_table; if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, &info->tables[i].tunnel_types, NULL, __ETHTOOL_UDP_TUNNEL_TYPE_CNT, udp_tunnel_type_names, compact)) goto err_cancel_table; if (udp_tunnel_nic_dump_write(req_base->dev, i, skb)) goto err_cancel_table; nla_nest_end(skb, table); } if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) { u32 zero = 0; table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE); if (!table) goto err_cancel_ports; if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, 1)) goto err_cancel_table; if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, &zero, NULL, __ETHTOOL_UDP_TUNNEL_TYPE_CNT, udp_tunnel_type_names, compact)) goto err_cancel_table; entry = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); if (!entry) goto err_cancel_entry; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, htons(IANA_VXLAN_UDP_PORT)) || nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, ilog2(UDP_TUNNEL_TYPE_VXLAN))) goto err_cancel_entry; nla_nest_end(skb, entry); nla_nest_end(skb, table); } nla_nest_end(skb, ports); return 0; err_cancel_entry: nla_nest_cancel(skb, entry); err_cancel_table: nla_nest_cancel(skb, table); err_cancel_ports: nla_nest_cancel(skb, ports); return -EMSGSIZE; } int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info) { struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct sk_buff *rskb; void *reply_payload; int reply_len; int ret; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_TUNNEL_INFO_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; rtnl_lock(); ret = ethnl_tunnel_info_reply_size(&req_info, info->extack); if (ret < 0) goto err_unlock_rtnl; reply_len = ret + ethnl_reply_header_size(); rskb = ethnl_reply_init(reply_len, req_info.dev, ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, ETHTOOL_A_TUNNEL_INFO_HEADER, info, &reply_payload); if (!rskb) { ret = -ENOMEM; goto err_unlock_rtnl; } ret = ethnl_tunnel_info_fill_reply(&req_info, rskb); if (ret) goto err_free_msg; rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); genlmsg_end(rskb, reply_payload); return genlmsg_reply(rskb, info); err_free_msg: nlmsg_free(rskb); err_unlock_rtnl: rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; } struct ethnl_tunnel_info_dump_ctx { struct ethnl_req_info req_info; unsigned long ifindex; }; int ethnl_tunnel_info_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->info.attrs; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); memset(ctx, 0, sizeof(*ctx)); ret = ethnl_parse_header_dev_get(&ctx->req_info, tb[ETHTOOL_A_TUNNEL_INFO_HEADER], sock_net(cb->skb->sk), cb->extack, false); if (ctx->req_info.dev) { ethnl_parse_header_dev_put(&ctx->req_info); ctx->req_info.dev = NULL; } return ret; } int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct net_device *dev; int ret = 0; void *ehdr; rtnl_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); if (!ehdr) { ret = -EMSGSIZE; break; } ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); if (ret < 0) { genlmsg_cancel(skb, ehdr); break; } ctx->req_info.dev = dev; ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); ctx->req_info.dev = NULL; if (ret < 0) { genlmsg_cancel(skb, ehdr); if (ret == -EOPNOTSUPP) continue; break; } genlmsg_end(skb, ehdr); } rtnl_unlock(); if (ret == -EMSGSIZE && skb->len) return skb->len; return ret; }
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_timestamp.h> static bool nf_ct_tstamp __read_mostly; module_param_named(tstamp, nf_ct_tstamp, bool, 0644); MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); void nf_conntrack_tstamp_pernet_init(struct net *net) { net->ct.sysctl_tstamp = nf_ct_tstamp; }
7 7 1 1 4 3 1 1 1 2 5 1 1 1 2 1 1 2 2 10 10 9 4 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Eric Leblond <eric@regit.org> * * Development of this code partly funded by OISF * (http://www.openinfosecfoundation.org/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/jhash.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_queue.h> static u32 jhash_initval __read_mostly; struct nft_queue { u8 sreg_qnum; u16 queuenum; u16 queues_total; u16 flags; }; static void nft_queue_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_queue *priv = nft_expr_priv(expr); u32 queue = priv->queuenum; u32 ret; if (priv->queues_total > 1) { if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { int cpu = raw_smp_processor_id(); queue = priv->queuenum + cpu % priv->queues_total; } else { queue = nfqueue_hash(pkt->skb, queue, priv->queues_total, nft_pf(pkt), jhash_initval); } } ret = NF_QUEUE_NR(queue); if (priv->flags & NFT_QUEUE_FLAG_BYPASS) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; regs->verdict.code = ret; } static void nft_queue_sreg_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_queue *priv = nft_expr_priv(expr); u32 queue, ret; queue = regs->data[priv->sreg_qnum]; ret = NF_QUEUE_NR(queue); if (priv->flags & NFT_QUEUE_FLAG_BYPASS) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; regs->verdict.code = ret; } static int nft_queue_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING)); switch (ctx->family) { case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: case NFPROTO_BRIDGE: break; case NFPROTO_NETDEV: /* lacks okfn */ fallthrough; default: return -EOPNOTSUPP; } return nft_chain_validate_hooks(ctx->chain, supported_hooks); } static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 }, [NFTA_QUEUE_SREG_QNUM] = { .type = NLA_U32 }, }; static int nft_queue_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_queue *priv = nft_expr_priv(expr); u32 maxid; priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); if (tb[NFTA_QUEUE_TOTAL]) priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); else priv->queues_total = 1; if (priv->queues_total == 0) return -EINVAL; maxid = priv->queues_total - 1 + priv->queuenum; if (maxid > U16_MAX) return -ERANGE; if (tb[NFTA_QUEUE_FLAGS]) { priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); if (priv->flags & ~NFT_QUEUE_FLAG_MASK) return -EINVAL; } return 0; } static int nft_queue_sreg_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_queue *priv = nft_expr_priv(expr); int err; err = nft_parse_register_load(ctx, tb[NFTA_QUEUE_SREG_QNUM], &priv->sreg_qnum, sizeof(u32)); if (err < 0) return err; if (tb[NFTA_QUEUE_FLAGS]) { priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); if (priv->flags & ~NFT_QUEUE_FLAG_MASK) return -EINVAL; if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) return -EOPNOTSUPP; } return 0; } static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); if (nla_put_be16(skb, NFTA_QUEUE_NUM, htons(priv->queuenum)) || nla_put_be16(skb, NFTA_QUEUE_TOTAL, htons(priv->queues_total)) || nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_QUEUE_SREG_QNUM, priv->sreg_qnum) || nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_queue_type; static const struct nft_expr_ops nft_queue_ops = { .type = &nft_queue_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)), .eval = nft_queue_eval, .init = nft_queue_init, .dump = nft_queue_dump, .validate = nft_queue_validate, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_queue_sreg_ops = { .type = &nft_queue_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)), .eval = nft_queue_sreg_eval, .init = nft_queue_sreg_init, .dump = nft_queue_sreg_dump, .validate = nft_queue_validate, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * nft_queue_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_QUEUE_NUM] && tb[NFTA_QUEUE_SREG_QNUM]) return ERR_PTR(-EINVAL); init_hashrandom(&jhash_initval); if (tb[NFTA_QUEUE_NUM]) return &nft_queue_ops; if (tb[NFTA_QUEUE_SREG_QNUM]) return &nft_queue_sreg_ops; return ERR_PTR(-EINVAL); } static struct nft_expr_type nft_queue_type __read_mostly = { .name = "queue", .select_ops = nft_queue_select_ops, .policy = nft_queue_policy, .maxattr = NFTA_QUEUE_MAX, .owner = THIS_MODULE, }; static int __init nft_queue_module_init(void) { return nft_register_expr(&nft_queue_type); } static void __exit nft_queue_module_exit(void) { nft_unregister_expr(&nft_queue_type); } module_init(nft_queue_module_init); module_exit(nft_queue_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eric Leblond <eric@regit.org>"); MODULE_ALIAS_NFT_EXPR("queue"); MODULE_DESCRIPTION("Netfilter nftables queue module");
8 8 8 8 8 8 8 1 2 2 1 4 4 4 4 4 4 3 4 4 4 3 3 3 3 3 5 2 3 3 3 3 1 1 1 1 1 12 12 12 12 8 2 3 1 1 1 1 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dccp/input.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include "ackvec.h" #include "ccid.h" #include "dccp.h" /* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) { __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); sk->sk_data_ready(sk); } static void dccp_fin(struct sock *sk, struct sk_buff *skb) { /* * On receiving Close/CloseReq, both RD/WR shutdown are performed. * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after * receiving the closing segment, but there is no guarantee that such * data will be processed at all. */ sk->sk_shutdown = SHUTDOWN_MASK; sock_set_flag(sk, SOCK_DONE); dccp_enqueue_skb(sk, skb); } static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) { int queued = 0; switch (sk->sk_state) { /* * We ignore Close when received in one of the following states: * - CLOSED (may be a late or duplicate packet) * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier) * - RESPOND (already handled by dccp_check_req) */ case DCCP_CLOSING: /* * Simultaneous-close: receiving a Close after sending one. This * can happen if both client and server perform active-close and * will result in an endless ping-pong of crossing and retrans- * mitted Close packets, which only terminates when one of the * nodes times out (min. 64 seconds). Quicker convergence can be * achieved when one of the nodes acts as tie-breaker. * This is ok as both ends are done with data transfer and each * end is just waiting for the other to acknowledge termination. */ if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) break; fallthrough; case DCCP_REQUESTING: case DCCP_ACTIVE_CLOSEREQ: dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); dccp_done(sk); break; case DCCP_OPEN: case DCCP_PARTOPEN: /* Give waiting application a chance to read pending data */ queued = 1; dccp_fin(sk, skb); dccp_set_state(sk, DCCP_PASSIVE_CLOSE); fallthrough; case DCCP_PASSIVE_CLOSE: /* * Retransmitted Close: we have already enqueued the first one. */ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); } return queued; } static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) { int queued = 0; /* * Step 7: Check for unexpected packet types * If (S.is_server and P.type == CloseReq) * Send Sync packet acknowledging P.seqno * Drop packet and return */ if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); return queued; } /* Step 13: process relevant Client states < CLOSEREQ */ switch (sk->sk_state) { case DCCP_REQUESTING: dccp_send_close(sk, 0); dccp_set_state(sk, DCCP_CLOSING); break; case DCCP_OPEN: case DCCP_PARTOPEN: /* Give waiting application a chance to read pending data */ queued = 1; dccp_fin(sk, skb); dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ); fallthrough; case DCCP_PASSIVE_CLOSEREQ: sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); } return queued; } static u16 dccp_reset_code_convert(const u8 code) { static const u16 error_code[] = { [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */ [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */ [DCCP_RESET_CODE_ABORTED] = ECONNRESET, [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED, [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED, [DCCP_RESET_CODE_TOO_BUSY] = EUSERS, [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT, [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG, [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR, [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC, [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ, [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP, }; return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code]; } static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) { u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code); sk->sk_err = err; /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ dccp_fin(sk, skb); if (err && !sock_flag(sk, SOCK_DEAD)) sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); dccp_time_wait(sk, DCCP_TIME_WAIT, 0); } static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) { struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; if (av == NULL) return; if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); dccp_ackvec_input(av, skb); } static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) { const struct dccp_sock *dp = dccp_sk(sk); /* Don't deliver to RX CCID when node has shut down read end. */ if (!(sk->sk_shutdown & RCV_SHUTDOWN)) ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); /* * Until the TX queue has been drained, we can not honour SHUT_WR, since * we need received feedback as input to adjust congestion control. */ if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN)) ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); } static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); struct dccp_sock *dp = dccp_sk(sk); u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq, ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; /* * Step 5: Prepare sequence numbers for Sync * If P.type == Sync or P.type == SyncAck, * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, * / * P is valid, so update sequence number variables * accordingly. After this update, P will pass the tests * in Step 6. A SyncAck is generated if necessary in * Step 15 * / * Update S.GSR, S.SWL, S.SWH * Otherwise, * Drop packet and return */ if (dh->dccph_type == DCCP_PKT_SYNC || dh->dccph_type == DCCP_PKT_SYNCACK) { if (between48(ackno, dp->dccps_awl, dp->dccps_awh) && dccp_delta_seqno(dp->dccps_swl, seqno) >= 0) dccp_update_gsr(sk, seqno); else return -1; } /* * Step 6: Check sequence numbers * Let LSWL = S.SWL and LAWL = S.AWL * If P.type == CloseReq or P.type == Close or P.type == Reset, * LSWL := S.GSR + 1, LAWL := S.GAR * If LSWL <= P.seqno <= S.SWH * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), * Update S.GSR, S.SWL, S.SWH * If P.type != Sync, * Update S.GAR */ lswl = dp->dccps_swl; lawl = dp->dccps_awl; if (dh->dccph_type == DCCP_PKT_CLOSEREQ || dh->dccph_type == DCCP_PKT_CLOSE || dh->dccph_type == DCCP_PKT_RESET) { lswl = ADD48(dp->dccps_gsr, 1); lawl = dp->dccps_gar; } if (between48(seqno, lswl, dp->dccps_swh) && (ackno == DCCP_PKT_WITHOUT_ACK_SEQ || between48(ackno, lawl, dp->dccps_awh))) { dccp_update_gsr(sk, seqno); if (dh->dccph_type != DCCP_PKT_SYNC && ackno != DCCP_PKT_WITHOUT_ACK_SEQ && after48(ackno, dp->dccps_gar)) dp->dccps_gar = ackno; } else { unsigned long now = jiffies; /* * Step 6: Check sequence numbers * Otherwise, * If P.type == Reset, * Send Sync packet acknowledging S.GSR * Otherwise, * Send Sync packet acknowledging P.seqno * Drop packet and return * * These Syncs are rate-limited as per RFC 4340, 7.5.4: * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second. */ if (time_before(now, (dp->dccps_rate_last + sysctl_dccp_sync_ratelimit))) return -1; DCCP_WARN("Step 6 failed for %s packet, " "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " "sending SYNC...\n", dccp_packet_name(dh->dccph_type), (unsigned long long) lswl, (unsigned long long) seqno, (unsigned long long) dp->dccps_swh, (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists", (unsigned long long) lawl, (unsigned long long) ackno, (unsigned long long) dp->dccps_awh); dp->dccps_rate_last = now; if (dh->dccph_type == DCCP_PKT_RESET) seqno = dp->dccps_gsr; dccp_send_sync(sk, seqno, DCCP_PKT_SYNC); return -1; } return 0; } static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); switch (dccp_hdr(skb)->dccph_type) { case DCCP_PKT_DATAACK: case DCCP_PKT_DATA: /* * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening" * - sk_receive_queue is full, use Code 2, "Receive Buffer" */ dccp_enqueue_skb(sk, skb); return 0; case DCCP_PKT_ACK: goto discard; case DCCP_PKT_RESET: /* * Step 9: Process Reset * If P.type == Reset, * Tear down connection * S.state := TIMEWAIT * Set TIMEWAIT timer * Drop packet and return */ dccp_rcv_reset(sk, skb); return 0; case DCCP_PKT_CLOSEREQ: if (dccp_rcv_closereq(sk, skb)) return 0; goto discard; case DCCP_PKT_CLOSE: if (dccp_rcv_close(sk, skb)) return 0; goto discard; case DCCP_PKT_REQUEST: /* Step 7 * or (S.is_server and P.type == Response) * or (S.is_client and P.type == Request) * or (S.state >= OPEN and P.type == Request * and P.seqno >= S.OSR) * or (S.state >= OPEN and P.type == Response * and P.seqno >= S.OSR) * or (S.state == RESPOND and P.type == Data), * Send Sync packet acknowledging P.seqno * Drop packet and return */ if (dp->dccps_role != DCCP_ROLE_LISTEN) goto send_sync; goto check_seq; case DCCP_PKT_RESPONSE: if (dp->dccps_role != DCCP_ROLE_CLIENT) goto send_sync; check_seq: if (dccp_delta_seqno(dp->dccps_osr, DCCP_SKB_CB(skb)->dccpd_seq) >= 0) { send_sync: dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); } break; case DCCP_PKT_SYNC: dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNCACK); /* * From RFC 4340, sec. 5.7 * * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets * MAY have non-zero-length application data areas, whose * contents receivers MUST ignore. */ goto discard; } DCCP_INC_STATS(DCCP_MIB_INERRS); discard: __kfree_skb(skb); return 0; } int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len) { if (dccp_check_seqno(sk, skb)) goto discard; if (dccp_parse_options(sk, NULL, skb)) return 1; dccp_handle_ackvec_processing(sk, skb); dccp_deliver_input_to_ccids(sk, skb); return __dccp_rcv_established(sk, skb, dh, len); discard: __kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(dccp_rcv_established); static int dccp_rcv_request_sent_state_process(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len) { /* * Step 4: Prepare sequence numbers in REQUEST * If S.state == REQUEST, * If (P.type == Response or P.type == Reset) * and S.AWL <= P.ackno <= S.AWH, * / * Set sequence number variables corresponding to the * other endpoint, so P will pass the tests in Step 6 * / * Set S.GSR, S.ISR, S.SWL, S.SWH * / * Response processing continues in Step 10; Reset * processing continues in Step 9 * / */ if (dh->dccph_type == DCCP_PKT_RESPONSE) { const struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); long tstamp = dccp_timestamp(); if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awl, dp->dccps_awh)) { dccp_pr_debug("invalid ackno: S.AWL=%llu, " "P.ackno=%llu, S.AWH=%llu\n", (unsigned long long)dp->dccps_awl, (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, (unsigned long long)dp->dccps_awh); goto out_invalid_packet; } /* * If option processing (Step 8) failed, return 1 here so that * dccp_v4_do_rcv() sends a Reset. The Reset code depends on * the option type and is set in dccp_parse_options(). */ if (dccp_parse_options(sk, NULL, skb)) return 1; /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); WARN_ON(sk->sk_send_head == NULL); kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; /* * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH * is done as part of activating the feature values below, since * these settings depend on the local/remote Sequence Window * features, which were undefined or not confirmed until now. */ dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); /* * Step 10: Process REQUEST state (second part) * If S.state == REQUEST, * / * If we get here, P is a valid Response from the * server (see Step 4), and we should move to * PARTOPEN state. PARTOPEN means send an Ack, * don't send Data packets, retransmit Acks * periodically, and always include any Init Cookie * from the Response * / * S.state := PARTOPEN * Set PARTOPEN timer * Continue with S.state == PARTOPEN * / * Step 12 will send the Ack completing the * three-way handshake * / */ dccp_set_state(sk, DCCP_PARTOPEN); /* * If feature negotiation was successful, activate features now; * an activation failure means that this host could not activate * one ore more features (e.g. insufficient memory), which would * leave at least one feature in an undefined state. */ if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) goto unable_to_proceed; /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); } if (sk->sk_write_pending || inet_csk_in_pingpong_mode(sk) || icsk->icsk_accept_queue.rskq_defer_accept) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * * It may be deleted, but with this feature tcpdumps * look so _wonderfully_ clever, that I was not able * to stand against the temptation 8) --ANK */ /* * OK, in DCCP we can as well do a similar trick, its * even in the draft, but there is no need for us to * schedule an ack here, as dccp_sendmsg does this for * us, also stated in the draft. -acme */ __kfree_skb(skb); return 0; } dccp_send_ack(sk); return -1; } out_invalid_packet: /* dccp_v4_do_rcv will send a reset */ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; return 1; unable_to_proceed: DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; /* * We mark this socket as no longer usable, so that the loop in * dccp_sendmsg() terminates and the application gets notified. */ dccp_set_state(sk, DCCP_CLOSED); sk->sk_err = ECOMM; return 1; } static int dccp_rcv_respond_partopen_state_process(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); u32 sample = dp->dccps_options_received.dccpor_timestamp_echo; int queued = 0; switch (dh->dccph_type) { case DCCP_PKT_RESET: inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); break; case DCCP_PKT_DATA: if (sk->sk_state == DCCP_RESPOND) break; fallthrough; case DCCP_PKT_DATAACK: case DCCP_PKT_ACK: /* * FIXME: we should be resetting the PARTOPEN (DELACK) timer * here but only if we haven't used the DELACK timer for * something else, like sending a delayed ack for a TIMESTAMP * echo, etc, for now were not clearing it, sending an extra * ACK when there is nothing else to do in DELACK is not a big * deal after all. */ /* Stop the PARTOPEN timer */ if (sk->sk_state == DCCP_PARTOPEN) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ if (likely(sample)) { long delta = dccp_timestamp() - sample; dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta); } dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_set_state(sk, DCCP_OPEN); if (dh->dccph_type == DCCP_PKT_DATAACK || dh->dccph_type == DCCP_PKT_DATA) { __dccp_rcv_established(sk, skb, dh, len); queued = 1; /* packet was queued (by __dccp_rcv_established) */ } break; } return queued; } int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct dccp_hdr *dh, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); const int old_state = sk->sk_state; bool acceptable; int queued = 0; /* * Step 3: Process LISTEN state * * If S.state == LISTEN, * If P.type == Request or P contains a valid Init Cookie option, * (* Must scan the packet's options to check for Init * Cookies. Only Init Cookies are processed here, * however; other options are processed in Step 8. This * scan need only be performed if the endpoint uses Init * Cookies *) * (* Generate a new socket and switch to that socket *) * Set S := new socket for this port pair * S.state = RESPOND * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init * Cookies Continue with S.state == RESPOND * (* A Response packet will be generated in Step 11 *) * Otherwise, * Generate Reset(No Connection) unless P.type == Reset * Drop packet and return */ if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { /* It is possible that we process SYN packets from backlog, * so we need to make sure to disable BH and RCU right there. */ rcu_read_lock(); local_bh_disable(); acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); rcu_read_unlock(); if (!acceptable) return 1; consume_skb(skb); return 0; } if (dh->dccph_type == DCCP_PKT_RESET) goto discard; /* Caller (dccp_v4_do_rcv) will send Reset */ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; return 1; } else if (sk->sk_state == DCCP_CLOSED) { dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; return 1; } /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) goto discard; /* * Step 7: Check for unexpected packet types * If (S.is_server and P.type == Response) * or (S.is_client and P.type == Request) * or (S.state == RESPOND and P.type == Data), * Send Sync packet acknowledging P.seqno * Drop packet and return */ if ((dp->dccps_role != DCCP_ROLE_CLIENT && dh->dccph_type == DCCP_PKT_RESPONSE) || (dp->dccps_role == DCCP_ROLE_CLIENT && dh->dccph_type == DCCP_PKT_REQUEST) || (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); goto discard; } /* Step 8: Process options */ if (dccp_parse_options(sk, NULL, skb)) return 1; /* * Step 9: Process Reset * If P.type == Reset, * Tear down connection * S.state := TIMEWAIT * Set TIMEWAIT timer * Drop packet and return */ if (dh->dccph_type == DCCP_PKT_RESET) { dccp_rcv_reset(sk, skb); return 0; } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ if (dccp_rcv_closereq(sk, skb)) return 0; goto discard; } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ if (dccp_rcv_close(sk, skb)) return 0; goto discard; } switch (sk->sk_state) { case DCCP_REQUESTING: queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) return queued; __kfree_skb(skb); return 0; case DCCP_PARTOPEN: /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ dccp_handle_ackvec_processing(sk, skb); dccp_deliver_input_to_ccids(sk, skb); fallthrough; case DCCP_RESPOND: queued = dccp_rcv_respond_partopen_state_process(sk, skb, dh, len); break; } if (dh->dccph_type == DCCP_PKT_ACK || dh->dccph_type == DCCP_PKT_DATAACK) { switch (old_state) { case DCCP_PARTOPEN: sk->sk_state_change(sk); sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); break; } } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) { dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK); goto discard; } if (!queued) { discard: __kfree_skb(skb); } return 0; } EXPORT_SYMBOL_GPL(dccp_rcv_state_process); /** * dccp_sample_rtt - Validate and finalise computation of RTT sample * @sk: socket structure * @delta: number of microseconds between packet and acknowledgment * * The routine is kept generic to work in different contexts. It should be * called immediately when the ACK used for the RTT sample arrives. */ u32 dccp_sample_rtt(struct sock *sk, long delta) { /* dccpor_elapsed_time is either zeroed out or set and > 0 */ delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; if (unlikely(delta <= 0)) { DCCP_WARN("unusable RTT sample %ld, using min\n", delta); return DCCP_SANE_RTT_MIN; } if (unlikely(delta > DCCP_SANE_RTT_MAX)) { DCCP_WARN("RTT sample %ld too large, using max\n", delta); return DCCP_SANE_RTT_MAX; } return delta; }
8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_ftp.c: IPVS ftp application module * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: * * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. * * IP_MASQ_FTP ftp masquerading module * * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 * * Author: Wouter Gadeyne */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <linux/gfp.h> #include <net/protocol.h> #include <net/tcp.h> #include <linux/unaligned.h> #include <net/ip_vs.h> #define SERVER_STRING_PASV "227 " #define CLIENT_STRING_PORT "PORT" #define SERVER_STRING_EPSV "229 " #define CLIENT_STRING_EPRT "EPRT" enum { IP_VS_FTP_ACTIVE = 0, IP_VS_FTP_PORT = 0, IP_VS_FTP_PASV, IP_VS_FTP_EPRT, IP_VS_FTP_EPSV, }; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. */ static unsigned int ports_count = 1; static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; module_param_array(ports, ushort, &ports_count, 0444); MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh) { struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len); if ((th->doff << 2) < sizeof(struct tcphdr)) return NULL; return (char *)th + (th->doff << 2); } static int ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) { /* We use connection tracking for the command connection */ cp->flags |= IP_VS_CONN_F_NFCT; return 0; } static int ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) { return 0; } /* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started * with the "pattern". <addr,port> is in network order. * Parse extended format depending on ext. In this case addr can be pre-set. */ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, const char *pattern, size_t plen, char skip, bool ext, int mode, union nf_inet_addr *addr, __be16 *port, __u16 af, char **start, char **end) { char *s, c; unsigned char p[6]; char edelim; __u16 hport; int i = 0; if (data_limit - data < plen) { /* check if there is partial match */ if (strncasecmp(data, pattern, data_limit - data) == 0) return -1; else return 0; } if (strncasecmp(data, pattern, plen) != 0) { return 0; } s = data + plen; if (skip) { bool found = false; for (;; s++) { if (s == data_limit) return -1; if (!found) { /* "(" is optional for non-extended format, * so catch the start of IPv4 address */ if (!ext && isdigit(*s)) break; if (*s == skip) found = true; } else if (*s != skip) { break; } } } /* Old IPv4-only format? */ if (!ext) { p[0] = 0; for (data = s; ; data++) { if (data == data_limit) return -1; c = *data; if (isdigit(c)) { p[i] = p[i]*10 + c - '0'; } else if (c == ',' && i < 5) { i++; p[i] = 0; } else { /* unexpected character or terminator */ break; } } if (i != 5) return -1; *start = s; *end = data; addr->ip = get_unaligned((__be32 *) p); *port = get_unaligned((__be16 *) (p + 4)); return 1; } if (s == data_limit) return -1; *start = s; edelim = *s++; if (edelim < 33 || edelim > 126) return -1; if (s == data_limit) return -1; if (*s == edelim) { /* Address family is usually missing for EPSV response */ if (mode != IP_VS_FTP_EPSV) return -1; s++; if (s == data_limit) return -1; /* Then address should be missing too */ if (*s != edelim) return -1; /* Caller can pre-set addr, if needed */ s++; } else { const char *ep; /* We allow address only from same family */ if (af == AF_INET6 && *s != '2') return -1; if (af == AF_INET && *s != '1') return -1; s++; if (s == data_limit) return -1; if (*s != edelim) return -1; s++; if (s == data_limit) return -1; if (af == AF_INET6) { if (in6_pton(s, data_limit - s, (u8 *)addr, edelim, &ep) <= 0) return -1; } else { if (in4_pton(s, data_limit - s, (u8 *)addr, edelim, &ep) <= 0) return -1; } s = (char *) ep; if (s == data_limit) return -1; if (*s != edelim) return -1; s++; } for (hport = 0; ; s++) { if (s == data_limit) return -1; if (!isdigit(*s)) break; hport = hport * 10 + *s - '0'; } if (s == data_limit || !hport || *s != edelim) return -1; s++; *end = s; *port = htons(hport); return 1; } /* Look at outgoing ftp packets to catch the response to a PASV/EPSV command * from the server (inside-to-outside). * When we see one, we build a connection entry with the client address, * client port 0 (unknown at the moment), the server address and the * server port. Mark the current connection entry as a control channel * of the new entry. All this work is just to make the data connection * can be scheduled to the right server later. * * The outgoing packet should be something like * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. * The extended format for EPSV response provides usually only port: * "229 Entering Extended Passive Mode (|||ppp|)" */ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, struct sk_buff *skb, int *diff, struct ip_vs_iphdr *ipvsh) { char *data, *data_limit; char *start, *end; union nf_inet_addr from; __be16 port; struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ unsigned int buf_len; int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; *diff = 0; /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; /* Linear packets are much easier to deal with. */ if (skb_ensure_writable(skb, skb->len)) return 0; if (cp->app_data == (void *) IP_VS_FTP_PASV) { data = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); if (!data || data >= data_limit) return 1; if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING_PASV, sizeof(SERVER_STRING_PASV)-1, '(', false, IP_VS_FTP_PASV, &from, &port, cp->af, &start, &end) != 1) return 1; IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n", &from.ip, ntohs(port), &cp->caddr.ip, 0); } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { data = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); if (!data || data >= data_limit) return 1; /* Usually, data address is not specified but * we support different address, so pre-set it. */ from = cp->daddr; if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING_EPSV, sizeof(SERVER_STRING_EPSV)-1, '(', true, IP_VS_FTP_EPSV, &from, &port, cp->af, &start, &end) != 1) return 1; IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n", IP_VS_DBG_ADDR(cp->af, &from), ntohs(port), IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0); } else { return 1; } /* Now update or create a connection entry for it */ { struct ip_vs_conn_param p; ip_vs_conn_fill_param(cp->ipvs, cp->af, ipvsh->protocol, &from, port, &cp->caddr, 0, &p); n_cp = ip_vs_conn_out_get(&p); } if (!n_cp) { struct ip_vs_conn_param p; ip_vs_conn_fill_param(cp->ipvs, cp->af, ipvsh->protocol, &cp->caddr, 0, &cp->vaddr, port, &p); n_cp = ip_vs_conn_new(&p, cp->af, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, cp->dest, skb->mark); if (!n_cp) return 0; /* add its controller */ ip_vs_control_add(n_cp, cp); } /* Replace the old passive address with the new one */ if (cp->app_data == (void *) IP_VS_FTP_PASV) { from.ip = n_cp->vaddr.ip; port = n_cp->vport; snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", ((unsigned char *)&from.ip)[0], ((unsigned char *)&from.ip)[1], ((unsigned char *)&from.ip)[2], ((unsigned char *)&from.ip)[3], ntohs(port) >> 8, ntohs(port) & 0xFF); } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { from = n_cp->vaddr; port = n_cp->vport; /* Only port, client will use VIP for the data connection */ snprintf(buf, sizeof(buf), "|||%u|", ntohs(port)); } else { *buf = 0; } buf_len = strlen(buf); ct = nf_ct_get(skb, &ctinfo); if (ct) { bool mangled; /* If mangling fails this function will return 0 * which will cause the packet to be dropped. * Mangling can only fail under memory pressure, * hopefully it will succeed on the retransmitted * packet. */ mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, ipvsh->len, start - data, end - start, buf, buf_len); if (mangled) { ip_vs_nfct_expect_related(skb, ct, n_cp, ipvsh->protocol, 0, 0); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_UNNECESSARY; /* csum is updated */ ret = 1; } } /* Not setting 'diff' is intentional, otherwise the sequence * would be adjusted twice. */ cp->app_data = (void *) IP_VS_FTP_ACTIVE; ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return ret; } /* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command * (outside-to-inside). * * The incoming packet having the PORT command should be something like * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. * In this case, we create a connection entry using the client address and * port, so that the active ftp data connection from the server can reach * the client. * Extended format: * "EPSV\r\n" when client requests server address from same family * "EPSV 1\r\n" when client requests IPv4 server address * "EPSV 2\r\n" when client requests IPv6 server address * "EPSV ALL\r\n" - not supported * EPRT with specified delimiter (ASCII 33..126), "|" by default: * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport */ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, struct sk_buff *skb, int *diff, struct ip_vs_iphdr *ipvsh) { char *data, *data_start, *data_limit; char *start, *end; union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; /* no diff required for incoming packets */ *diff = 0; /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; /* Linear packets are much easier to deal with. */ if (skb_ensure_writable(skb, skb->len)) return 0; data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); if (!data || data >= data_limit) return 1; while (data <= data_limit - 6) { if (cp->af == AF_INET && strncasecmp(data, "PASV\r\n", 6) == 0) { /* Passive mode on */ IP_VS_DBG(7, "got PASV at %td of %td\n", data - data_start, data_limit - data_start); cp->app_data = (void *) IP_VS_FTP_PASV; return 1; } /* EPSV or EPSV<space><net-prt> */ if (strncasecmp(data, "EPSV", 4) == 0 && (data[4] == ' ' || data[4] == '\r')) { if (data[4] == ' ') { char proto = data[5]; if (data > data_limit - 7 || data[6] != '\r') return 1; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && proto == '2') { } else #endif if (cp->af == AF_INET && proto == '1') { } else { return 1; } } /* Extended Passive mode on */ IP_VS_DBG(7, "got EPSV at %td of %td\n", data - data_start, data_limit - data_start); cp->app_data = (void *) IP_VS_FTP_EPSV; return 1; } data++; } /* * To support virtual FTP server, the scenerio is as follows: * FTP client ----> Load Balancer ----> FTP server * First detect the port number in the application data, * then create a new connection entry for the coming data * connection. */ if (cp->af == AF_INET && ip_vs_ftp_get_addrport(data_start, data_limit, CLIENT_STRING_PORT, sizeof(CLIENT_STRING_PORT)-1, ' ', false, IP_VS_FTP_PORT, &to, &port, cp->af, &start, &end) == 1) { IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port)); /* Now update or create a connection entry for it */ IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n", ip_vs_proto_name(ipvsh->protocol), &to.ip, ntohs(port), &cp->vaddr.ip, ntohs(cp->vport)-1); } else if (ip_vs_ftp_get_addrport(data_start, data_limit, CLIENT_STRING_EPRT, sizeof(CLIENT_STRING_EPRT)-1, ' ', true, IP_VS_FTP_EPRT, &to, &port, cp->af, &start, &end) == 1) { IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n", IP_VS_DBG_ADDR(cp->af, &to), ntohs(port)); /* Now update or create a connection entry for it */ IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n", ip_vs_proto_name(ipvsh->protocol), IP_VS_DBG_ADDR(cp->af, &to), ntohs(port), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)-1); } else { return 1; } /* Passive mode off */ cp->app_data = (void *) IP_VS_FTP_ACTIVE; { struct ip_vs_conn_param p; ip_vs_conn_fill_param(cp->ipvs, cp->af, ipvsh->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr, htons(ntohs(cp->dport)-1), IP_VS_CONN_F_NFCT, cp->dest, skb->mark); if (!n_cp) return 0; /* add its controller */ ip_vs_control_add(n_cp, cp); } } /* * Move tunnel to listen state */ ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return 1; } static struct ip_vs_app ip_vs_ftp = { .name = "ftp", .type = IP_VS_APP_TYPE_FTP, .protocol = IPPROTO_TCP, .module = THIS_MODULE, .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), .init_conn = ip_vs_ftp_init_conn, .done_conn = ip_vs_ftp_done_conn, .bind_conn = NULL, .unbind_conn = NULL, .pkt_out = ip_vs_ftp_out, .pkt_in = ip_vs_ftp_in, }; /* * per netns ip_vs_ftp initialization */ static int __net_init __ip_vs_ftp_init(struct net *net) { int i, ret; struct ip_vs_app *app; struct netns_ipvs *ipvs = net_ipvs(net); if (!ipvs) return -ENOENT; app = register_ip_vs_app(ipvs, &ip_vs_ftp); if (IS_ERR(app)) return PTR_ERR(app); for (i = 0; i < ports_count; i++) { if (!ports[i]) continue; ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; } return 0; err_unreg: unregister_ip_vs_app(ipvs, &ip_vs_ftp); return ret; } /* * netns exit */ static void __ip_vs_ftp_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); if (!ipvs) return; unregister_ip_vs_app(ipvs, &ip_vs_ftp); } static struct pernet_operations ip_vs_ftp_ops = { .init = __ip_vs_ftp_init, .exit = __ip_vs_ftp_exit, }; static int __init ip_vs_ftp_init(void) { /* rcu_barrier() is called by netns on error */ return register_pernet_subsys(&ip_vs_ftp_ops); } /* * ip_vs_ftp finish. */ static void __exit ip_vs_ftp_exit(void) { unregister_pernet_subsys(&ip_vs_ftp_ops); /* rcu_barrier() is called by netns */ } module_init(ip_vs_ftp_init); module_exit(ip_vs_ftp_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs ftp helper");
3277 3273 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM workqueue #if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WORKQUEUE_H #include <linux/tracepoint.h> #include <linux/workqueue.h> struct pool_workqueue; /** * workqueue_queue_work - called when a work gets queued * @req_cpu: the requested cpu * @pwq: pointer to struct pool_workqueue * @work: pointer to struct work_struct * * This event occurs when a work is queued immediately or once a * delayed work is actually queued on a workqueue (ie: once the delay * has been reached). */ TRACE_EVENT(workqueue_queue_work, TP_PROTO(int req_cpu, struct pool_workqueue *pwq, struct work_struct *work), TP_ARGS(req_cpu, pwq, work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) __string( workqueue, pwq->wq->name) __field( int, req_cpu ) __field( int, cpu ) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; __assign_str(workqueue); __entry->req_cpu = req_cpu; __entry->cpu = pwq->pool->cpu; ), TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d", __entry->work, __entry->function, __get_str(workqueue), __entry->req_cpu, __entry->cpu) ); /** * workqueue_activate_work - called when a work gets activated * @work: pointer to struct work_struct * * This event occurs when a queued work is put on the active queue, * which happens immediately after queueing unless @max_active limit * is reached. */ TRACE_EVENT(workqueue_activate_work, TP_PROTO(struct work_struct *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p function=%ps ", __entry->work, __entry->function) ); /** * workqueue_execute_start - called immediately before the workqueue callback * @work: pointer to struct work_struct * * Allows to track workqueue execution. */ TRACE_EVENT(workqueue_execute_start, TP_PROTO(struct work_struct *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /** * workqueue_execute_end - called immediately after the workqueue callback * @work: pointer to struct work_struct * @function: pointer to worker function * * Allows to track workqueue execution. */ TRACE_EVENT(workqueue_execute_end, TP_PROTO(struct work_struct *work, work_func_t function), TP_ARGS(work, function), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = function; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); #endif /* _TRACE_WORKQUEUE_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
4 4 1 3 3 8 8 86 85 85 85 85 85 85 7 2 5 5 5 2 2 14 39 39 39 38 33 6 5 5 36 1 35 14 20 20 1 12 1 1 1 11 2 4 1 20 20 39 3 6 3 66 64 20 14 62 66 8 8 8 8 6 6 5 3 2 8 8 1 6 6 6 1 5 6 3 1 2 2 3 3 3 1 1 1 1 1 1 1 3 6 6 8 2 4 2 6 5 1 6 6 6 3 6 959 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "soft-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/cpumask.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" #include "send.h" #include "translation-table.h" /** * batadv_skb_head_push() - Increase header size and move (push) head pointer * @skb: packet buffer which should be modified * @len: number of bytes to add * * Return: 0 on success or negative error number in case of failure */ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) { int result; /* TODO: We must check if we can release all references to non-payload * data using __skb_header_release in our skbs to allow skb_cow_header * to work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer * to write freely in that area. */ result = skb_cow_head(skb, len); if (result < 0) return result; skb_push(skb, len); return 0; } static int batadv_interface_open(struct net_device *dev) { netif_start_queue(dev); return 0; } static int batadv_interface_release(struct net_device *dev) { netif_stop_queue(dev); return 0; } /** * batadv_sum_counter() - Sum the cpu-local counters for index 'idx' * @bat_priv: the bat priv with all the soft interface information * @idx: index of counter to sum up * * Return: sum of all cpu-local counters */ static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { u64 *counters, sum = 0; int cpu; for_each_possible_cpu(cpu) { counters = per_cpu_ptr(bat_priv->bat_counters, cpu); sum += counters[idx]; } return sum; } static struct net_device_stats *batadv_interface_stats(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; stats->tx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_TX); stats->tx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_TX_BYTES); stats->tx_dropped = batadv_sum_counter(bat_priv, BATADV_CNT_TX_DROPPED); stats->rx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_RX); stats->rx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_RX_BYTES); return stats; } static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; struct sockaddr *addr = p; u8 old_addr[ETH_ALEN]; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; ether_addr_copy(old_addr, dev->dev_addr); eth_hw_addr_set(dev, addr->sa_data); /* only modify transtable if it has been initialized before */ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return 0; rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); batadv_tt_local_add(dev, addr->sa_data, vlan->vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); } rcu_read_unlock(); return 0; } static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { struct batadv_priv *bat_priv = netdev_priv(dev); /* check ranges */ if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); bat_priv->mtu_set_by_user = new_mtu; return 0; } /** * batadv_interface_set_rx_mode() - set the rx mode of a device * @dev: registered network device to modify * * We do not actually need to set any rx filters for the virtual batman * soft interface. However a dummy handler enables a user to set static * multicast listeners for instance. */ static void batadv_interface_set_rx_mode(struct net_device *dev) { } static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, struct net_device *soft_iface) { struct ethhdr *ethhdr; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, 0x00, 0x00}; static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, 0x00, 0x00}; enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO; u8 *dst_hint = NULL, chaddr[ETH_ALEN]; struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; unsigned long brd_delay = 0; bool do_bcast = false, client_added; unsigned short vid; u32 seqno; int gw_mode; enum batadv_forw_mode forw_mode = BATADV_FORW_BCAST; int mcast_is_routable = 0; int network_offset = ETH_HLEN; __be16 proto; if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; /* reset control block to avoid left overs from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); netif_trans_update(soft_iface); vid = batadv_get_vid(skb, 0); skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; switch (ntohs(proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, sizeof(*vhdr))) goto dropped; vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; /* drop batman-in-batman packets to prevent loops */ if (proto != htons(ETH_P_BATMAN)) { network_offset += VLAN_HLEN; break; } fallthrough; case ETH_P_BATMAN: goto dropped; } skb_set_network_header(skb, network_offset); if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; /* skb->data might have been reallocated by batadv_bla_tx() */ ethhdr = eth_hdr(skb); /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source) && !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) { client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, vid, skb->skb_iif, skb->mark); if (!client_added) goto dropped; } /* Snoop address candidates from DHCPACKs for early DAT filling */ batadv_dat_snoop_outgoing_dhcp_ack(bat_priv, skb, proto, vid); /* don't accept stp packets. STP does not help in meshes. * better use the bridge loop avoidance ... * * The same goes for ECTP sent at least by some Cisco Switches, * it might confuse the mesh when used with bridge loop avoidance. */ if (batadv_compare_eth(ethhdr->h_dest, stp_addr)) goto dropped; if (batadv_compare_eth(ethhdr->h_dest, ectp_addr)) goto dropped; gw_mode = atomic_read(&bat_priv->gw.mode); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* if gw mode is off, broadcast every packet */ if (gw_mode == BATADV_GW_MODE_OFF) { do_bcast = true; goto send; } dhcp_rcp = batadv_gw_dhcp_recipient_get(skb, &header_len, chaddr); /* skb->data may have been modified by * batadv_gw_dhcp_recipient_get() */ ethhdr = eth_hdr(skb); /* if gw_mode is on, broadcast any non-DHCP message. * All the DHCP packets are going to be sent as unicast */ if (dhcp_rcp == BATADV_DHCP_NO) { do_bcast = true; goto send; } if (dhcp_rcp == BATADV_DHCP_TO_CLIENT) dst_hint = chaddr; else if ((gw_mode == BATADV_GW_MODE_SERVER) && (dhcp_rcp == BATADV_DHCP_TO_SERVER)) /* gateways should not forward any DHCP message if * directed to a DHCP server */ goto dropped; send: if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { forw_mode = batadv_mcast_forw_mode(bat_priv, skb, vid, &mcast_is_routable); switch (forw_mode) { case BATADV_FORW_BCAST: break; case BATADV_FORW_UCASTS: case BATADV_FORW_MCAST: do_bcast = false; break; case BATADV_FORW_NONE: fallthrough; default: goto dropped; } } } batadv_skb_set_priority(skb, 0); /* ethernet packet should be broadcasted */ if (do_bcast) { primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto dropped; /* in case of ARP request, we do not immediately broadcasti the * packet, instead we first wait for DAT to try to retrieve the * correct ARP entry */ if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) brd_delay = msecs_to_jiffies(ARP_REQ_DELAY); if (batadv_skb_head_push(skb, sizeof(*bcast_packet)) < 0) goto dropped; bcast_packet = (struct batadv_bcast_packet *)skb->data; bcast_packet->version = BATADV_COMPAT_VERSION; bcast_packet->ttl = BATADV_TTL - 1; /* batman packet type: broadcast */ bcast_packet->packet_type = BATADV_BCAST; bcast_packet->reserved = 0; /* hw address of first interface is the orig mac because only * this mac is known throughout the mesh */ ether_addr_copy(bcast_packet->orig, primary_if->net_dev->dev_addr); /* set broadcast sequence number */ seqno = atomic_inc_return(&bat_priv->bcast_seqno); bcast_packet->seqno = htonl(seqno); batadv_send_bcast_packet(bat_priv, skb, brd_delay, true); /* unicast packet */ } else { /* DHCP packets going to a server will use the GW feature */ if (dhcp_rcp == BATADV_DHCP_TO_SERVER) { ret = batadv_gw_out_of_range(bat_priv, skb); if (ret) goto dropped; ret = batadv_send_skb_via_gw(bat_priv, skb, vid); } else if (forw_mode == BATADV_FORW_UCASTS) { ret = batadv_mcast_forw_send(bat_priv, skb, vid, mcast_is_routable); } else if (forw_mode == BATADV_FORW_MCAST) { ret = batadv_mcast_forw_mcsend(bat_priv, skb); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) goto dropped; batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb); ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint, vid); } if (ret != NET_XMIT_SUCCESS) goto dropped_freed; } batadv_inc_counter(bat_priv, BATADV_CNT_TX); batadv_add_counter(bat_priv, BATADV_CNT_TX_BYTES, data_len); goto end; dropped: kfree_skb(skb); dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: batadv_hardif_put(primary_if); return NETDEV_TX_OK; } /** * batadv_interface_rx() - receive ethernet frame on local batman-adv interface * @soft_iface: local interface which will receive the ethernet frame * @skb: ethernet frame for @soft_iface * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. * * The packet may still get dropped. This can happen when the encapsulated * ethernet frame is invalid or contains again an batman-adv packet. Also * unicast packets will be dropped directly when it was sent between two * isolated clients. */ void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_bcast_packet *batadv_bcast_packet; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct vlan_ethhdr *vhdr; struct ethhdr *ethhdr; unsigned short vid; int packet_type; batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data; packet_type = batadv_bcast_packet->packet_type; skb_pull_rcsum(skb, hdr_size); skb_reset_mac_header(skb); /* clean the netfilter state now that the batman-adv header has been * removed */ nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, VLAN_ETH_HLEN)) goto dropped; vhdr = skb_vlan_eth_hdr(skb); /* drop batman-in-batman packets to prevent loops */ if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; fallthrough; case ETH_P_BATMAN: goto dropped; } /* skb->dev & skb->pkt_type are set here */ skb->protocol = eth_type_trans(skb, soft_iface); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); /* Let the bridge loop avoidance check the packet. If will * not handle it, we can safely push it up. */ if (batadv_bla_rx(bat_priv, skb, vid, packet_type)) goto out; if (orig_node) batadv_tt_add_temporary_global_entry(bat_priv, orig_node, ethhdr->h_source, vid); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* set the mark on broadcast packets if AP isolation is ON and * the packet is coming from an "isolated" client */ if (batadv_vlan_ap_isola_get(bat_priv, vid) && batadv_tt_global_is_isolated(bat_priv, ethhdr->h_source, vid)) { /* save bits in skb->mark not covered by the mask and * apply the mark on the rest */ skb->mark &= ~bat_priv->isolation_mark_mask; skb->mark |= bat_priv->isolation_mark; } } else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid)) { goto dropped; } netif_rx(skb); goto out; dropped: kfree_skb(skb); out: return; } /** * batadv_softif_vlan_release() - release vlan from lists and queue for free * after rcu grace period * @ref: kref pointer of the vlan object */ void batadv_softif_vlan_release(struct kref *ref) { struct batadv_softif_vlan *vlan; vlan = container_of(ref, struct batadv_softif_vlan, refcount); spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); hlist_del_rcu(&vlan->list); spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); kfree_rcu(vlan, rcu); } /** * batadv_softif_vlan_get() - get the vlan object for a specific vid * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve * * Return: the private data of the vlan matching the vid passed as argument or * NULL otherwise. The refcounter of the returned object is incremented by 1. */ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan_tmp, *vlan = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) { if (vlan_tmp->vid != vid) continue; if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; break; } rcu_read_unlock(); return vlan; } /** * batadv_softif_create_vlan() - allocate the needed resources for a new vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * * Return: 0 on success, a negative error otherwise. */ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan; spin_lock_bh(&bat_priv->softif_vlan_list_lock); vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { batadv_softif_vlan_put(vlan); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); if (!vlan) { spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -ENOMEM; } vlan->bat_priv = bat_priv; vlan->vid = vid; kref_init(&vlan->refcount); atomic_set(&vlan->ap_isolation, 0); kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); /* don't return reference to new softif_vlan */ batadv_softif_vlan_put(vlan); return 0; } /** * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object * @bat_priv: the bat priv with all the soft interface information * @vlan: the object to remove */ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan) { /* explicitly remove the associated TT local entry because it is marked * with the NOPURGE flag */ batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, vlan->vid, "vlan interface destroyed", false); batadv_softif_vlan_put(vlan); } /** * batadv_interface_add_vid() - ndo_add_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the new vlan * * Set up all the internal structures for handling the new vlan on top of the * mesh interface * * Return: 0 on success or a negative error code in case of failure. */ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. * batman-adv does not know how to handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; /* VID 0 is only used to indicate "priority tag" frames which only * contain priority information and no VID. No management structures * should be created for this VID and it should be handled like an * untagged frame. */ if (vid == 0) return 0; vid |= BATADV_VLAN_HAS_TAG; /* if a new vlan is getting created and it already exists, it means that * it was not deleted yet. batadv_softif_vlan_get() increases the * refcount in order to revive the object. * * if it does not exist then create it. */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) return batadv_softif_create_vlan(bat_priv, vid); /* add a new TT local entry. This one will be marked with the NOPURGE * flag. This must be added again, even if the vlan object already * exists, because the entry was deleted by kill_vid() */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); return 0; } /** * batadv_interface_kill_vid() - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the deleted vlan * * Destroy all the internal structures used to handle the vlan identified by vid * on top of the mesh interface * * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q * or -ENOENT if the specified vlan id wasn't registered. */ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. batman-adv does not know how to * handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; /* "priority tag" frames are handled like "untagged" frames * and no softif_vlan needs to be destroyed */ if (vid == 0) return 0; vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return -ENOENT; batadv_softif_destroy_vlan(bat_priv, vlan); /* finally free the vlan object */ batadv_softif_vlan_put(vlan); return 0; } /* batman-adv network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key batadv_netdev_xmit_lock_key; static struct lock_class_key batadv_netdev_addr_lock_key; /** * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue * @dev: device which owns the tx queue * @txq: tx queue to modify * @_unused: always NULL */ static void batadv_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) { lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); } /** * batadv_set_lockdep_class() - Set txq and addr_list lockdep class * @dev: network device to modify */ static void batadv_set_lockdep_class(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); } /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify * * Return: error code on failures */ static int batadv_softif_init_late(struct net_device *dev) { struct batadv_priv *bat_priv; u32 random_seqno; int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; batadv_set_lockdep_class(dev); bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called */ bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64)); if (!bat_priv->bat_counters) return -ENOMEM; atomic_set(&bat_priv->aggregated_ogms, 1); atomic_set(&bat_priv->bonding, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bridge_loop_avoidance, 1); #endif #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->multicast_fanout, 16); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); atomic_set(&bat_priv->mcast.num_no_mc_ptype_capa, 0); #endif atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); atomic_set(&bat_priv->hop_penalty, 30); #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_set(&bat_priv->log_level, 0); #endif atomic_set(&bat_priv->fragmentation, 1); atomic_set(&bat_priv->packet_size_max, ETH_DATA_LEN); atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN); atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN); atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); atomic_set(&bat_priv->bcast_seqno, 1); atomic_set(&bat_priv->tt.vn, 0); atomic_set(&bat_priv->tt.ogm_append_cnt, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); #endif atomic_set(&bat_priv->tp_num, 0); WRITE_ONCE(bat_priv->tt.local_changes, 0); bat_priv->tt.last_changeset = NULL; bat_priv->tt.last_changeset_len = 0; bat_priv->isolation_mark = 0; bat_priv->isolation_mark_mask = 0; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&bat_priv->frag_seqno, random_seqno); bat_priv->primary_if = NULL; batadv_nc_init_bat_priv(bat_priv); if (!bat_priv->algo_ops) { ret = batadv_algo_select(bat_priv, batadv_routing_algo); if (ret < 0) goto free_bat_counters; } ret = batadv_mesh_init(dev); if (ret < 0) goto free_bat_counters; return 0; free_bat_counters: free_percpu(bat_priv->bat_counters); bat_priv->bat_counters = NULL; return ret; } /** * batadv_softif_slave_add() - Add a slave interface to a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; ret = batadv_hardif_enable_interface(hard_iface, dev); out: batadv_hardif_put(hard_iface); return ret; } /** * batadv_softif_slave_del() - Delete a slave iface from a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_del(struct net_device *dev, struct net_device *slave_dev) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface != dev) goto out; batadv_hardif_disable_interface(hard_iface); ret = 0; out: batadv_hardif_put(hard_iface); return ret; } static const struct net_device_ops batadv_netdev_ops = { .ndo_init = batadv_softif_init_late, .ndo_open = batadv_interface_open, .ndo_stop = batadv_interface_release, .ndo_get_stats = batadv_interface_stats, .ndo_vlan_rx_add_vid = batadv_interface_add_vid, .ndo_vlan_rx_kill_vid = batadv_interface_kill_vid, .ndo_set_mac_address = batadv_interface_set_mac_addr, .ndo_change_mtu = batadv_interface_change_mtu, .ndo_set_rx_mode = batadv_interface_set_rx_mode, .ndo_start_xmit = batadv_interface_tx, .ndo_validate_addr = eth_validate_addr, .ndo_add_slave = batadv_softif_slave_add, .ndo_del_slave = batadv_softif_slave_del, }; static void batadv_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver)); strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); strscpy(info->bus_info, "batman", sizeof(info->bus_info)); } /* Inspired by drivers/net/ethernet/dlink/sundance.c:1702 * Declare each description string in struct.name[] to get fixed sized buffer * and compile time checking for strings longer than ETH_GSTRING_LEN. */ static const struct { const char name[ETH_GSTRING_LEN]; } batadv_counters_strings[] = { { "tx" }, { "tx_bytes" }, { "tx_dropped" }, { "rx" }, { "rx_bytes" }, { "forward" }, { "forward_bytes" }, { "mgmt_tx" }, { "mgmt_tx_bytes" }, { "mgmt_rx" }, { "mgmt_rx_bytes" }, { "frag_tx" }, { "frag_tx_bytes" }, { "frag_rx" }, { "frag_rx_bytes" }, { "frag_fwd" }, { "frag_fwd_bytes" }, { "tt_request_tx" }, { "tt_request_rx" }, { "tt_response_tx" }, { "tt_response_rx" }, { "tt_roam_adv_tx" }, { "tt_roam_adv_rx" }, #ifdef CONFIG_BATMAN_ADV_MCAST { "mcast_tx" }, { "mcast_tx_bytes" }, { "mcast_tx_local" }, { "mcast_tx_local_bytes" }, { "mcast_rx" }, { "mcast_rx_bytes" }, { "mcast_rx_local" }, { "mcast_rx_local_bytes" }, { "mcast_fwd" }, { "mcast_fwd_bytes" }, #endif #ifdef CONFIG_BATMAN_ADV_DAT { "dat_get_tx" }, { "dat_get_rx" }, { "dat_put_tx" }, { "dat_put_rx" }, { "dat_cached_reply_tx" }, #endif #ifdef CONFIG_BATMAN_ADV_NC { "nc_code" }, { "nc_code_bytes" }, { "nc_recode" }, { "nc_recode_bytes" }, { "nc_buffer" }, { "nc_decode" }, { "nc_decode_bytes" }, { "nc_decode_failed" }, { "nc_sniffed" }, #endif }; static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) { if (stringset == ETH_SS_STATS) memcpy(data, batadv_counters_strings, sizeof(batadv_counters_strings)); } static void batadv_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct batadv_priv *bat_priv = netdev_priv(dev); int i; for (i = 0; i < BATADV_CNT_NUM; i++) data[i] = batadv_sum_counter(bat_priv, i); } static int batadv_get_sset_count(struct net_device *dev, int stringset) { if (stringset == ETH_SS_STATS) return BATADV_CNT_NUM; return -EOPNOTSUPP; } static const struct ethtool_ops batadv_ethtool_ops = { .get_drvinfo = batadv_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = batadv_get_strings, .get_ethtool_stats = batadv_get_ethtool_stats, .get_sset_count = batadv_get_sset_count, }; /** * batadv_softif_free() - Deconstructor of batadv_soft_interface * @dev: Device to cleanup and remove */ static void batadv_softif_free(struct net_device *dev) { batadv_mesh_free(dev); /* some scheduled RCU callbacks need the bat_priv struct to accomplish * their tasks. Wait for them all to be finished before freeing the * netdev and its private data (bat_priv) */ rcu_barrier(); } /** * batadv_softif_init_early() - early stage initialization of soft interface * @dev: registered network device to modify */ static void batadv_softif_init_early(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &batadv_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->netns_local = true; /* can't call min_mtu, because the needed variables * have not been initialized yet */ dev->mtu = ETH_DATA_LEN; /* generate random address */ eth_hw_addr_random(dev); dev->ethtool_ops = &batadv_ethtool_ops; } /** * batadv_softif_validate() - validate configuration of new batadv link * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_algo_ops *algo_ops; if (!data) return 0; if (data[IFLA_BATADV_ALGO_NAME]) { algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME])); if (!algo_ops) return -EINVAL; } return 0; } /** * batadv_softif_newlink() - pre-initialize and register new batadv link * @src_net: the applicable net namespace * @dev: network device to register * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_priv *bat_priv = netdev_priv(dev); const char *algo_name; int err; if (data && data[IFLA_BATADV_ALGO_NAME]) { algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]); err = batadv_algo_select(bat_priv, algo_name); if (err) return -EINVAL; } return register_netdevice(dev); } /** * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via * netlink * @soft_iface: the to-be-removed batman-adv interface * @head: list pointer */ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, struct list_head *head) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *hard_iface; struct batadv_softif_vlan *vlan; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface == soft_iface) batadv_hardif_disable_interface(hard_iface); } /* destroy the "untagged" VLAN */ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (vlan) { batadv_softif_destroy_vlan(bat_priv, vlan); batadv_softif_vlan_put(vlan); } unregister_netdevice_queue(soft_iface, head); } /** * batadv_softif_is_valid() - Check whether device is a batadv soft interface * @net_dev: device which should be checked * * Return: true when net_dev is a batman-adv interface, false otherwise */ bool batadv_softif_is_valid(const struct net_device *net_dev) { if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx) return true; return false; } static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = { [IFLA_BATADV_ALGO_NAME] = { .type = NLA_NUL_STRING }, }; struct rtnl_link_ops batadv_link_ops __read_mostly = { .kind = "batadv", .priv_size = sizeof(struct batadv_priv), .setup = batadv_softif_init_early, .maxtype = IFLA_BATADV_MAX, .policy = batadv_ifla_policy, .validate = batadv_softif_validate, .newlink = batadv_softif_newlink, .dellink = batadv_softif_destroy_netlink, };
1 1 1 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 // SPDX-License-Identifier: GPL-2.0 /* * thermal.c - Generic Thermal Management Sysfs support. * * Copyright (C) 2008 Intel Corp * Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com> * Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/kdev_t.h> #include <linux/idr.h> #include <linux/thermal.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/of.h> #include <linux/suspend.h> #define CREATE_TRACE_POINTS #include "thermal_trace.h" #include "thermal_core.h" #include "thermal_hwmon.h" static DEFINE_IDA(thermal_tz_ida); static DEFINE_IDA(thermal_cdev_ida); static LIST_HEAD(thermal_tz_list); static LIST_HEAD(thermal_cdev_list); static LIST_HEAD(thermal_governor_list); static DEFINE_MUTEX(thermal_list_lock); static DEFINE_MUTEX(thermal_governor_lock); static struct thermal_governor *def_governor; static bool thermal_pm_suspended; /* * Governor section: set of functions to handle thermal governors * * Functions to help in the life cycle of thermal governors within * the thermal core and by the thermal governor code. */ static struct thermal_governor *__find_governor(const char *name) { struct thermal_governor *pos; if (!name || !name[0]) return def_governor; list_for_each_entry(pos, &thermal_governor_list, governor_list) if (!strncasecmp(name, pos->name, THERMAL_NAME_LENGTH)) return pos; return NULL; } /** * bind_previous_governor() - bind the previous governor of the thermal zone * @tz: a valid pointer to a struct thermal_zone_device * @failed_gov_name: the name of the governor that failed to register * * Register the previous governor of the thermal zone after a new * governor has failed to be bound. */ static void bind_previous_governor(struct thermal_zone_device *tz, const char *failed_gov_name) { if (tz->governor && tz->governor->bind_to_tz) { if (tz->governor->bind_to_tz(tz)) { dev_err(&tz->device, "governor %s failed to bind and the previous one (%s) failed to bind again, thermal zone %s has no governor\n", failed_gov_name, tz->governor->name, tz->type); tz->governor = NULL; } } } /** * thermal_set_governor() - Switch to another governor * @tz: a valid pointer to a struct thermal_zone_device * @new_gov: pointer to the new governor * * Change the governor of thermal zone @tz. * * Return: 0 on success, an error if the new governor's bind_to_tz() failed. */ static int thermal_set_governor(struct thermal_zone_device *tz, struct thermal_governor *new_gov) { int ret = 0; if (tz->governor && tz->governor->unbind_from_tz) tz->governor->unbind_from_tz(tz); if (new_gov && new_gov->bind_to_tz) { ret = new_gov->bind_to_tz(tz); if (ret) { bind_previous_governor(tz, new_gov->name); return ret; } } tz->governor = new_gov; return ret; } int thermal_register_governor(struct thermal_governor *governor) { int err; const char *name; struct thermal_zone_device *pos; if (!governor) return -EINVAL; guard(mutex)(&thermal_governor_lock); err = -EBUSY; if (!__find_governor(governor->name)) { bool match_default; err = 0; list_add(&governor->governor_list, &thermal_governor_list); match_default = !strncmp(governor->name, DEFAULT_THERMAL_GOVERNOR, THERMAL_NAME_LENGTH); if (!def_governor && match_default) def_governor = governor; } guard(mutex)(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) { /* * only thermal zones with specified tz->tzp->governor_name * may run with tz->govenor unset */ if (pos->governor) continue; name = pos->tzp->governor_name; if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH)) { int ret; ret = thermal_set_governor(pos, governor); if (ret) dev_err(&pos->device, "Failed to set governor %s for thermal zone %s: %d\n", governor->name, pos->type, ret); } } return err; } void thermal_unregister_governor(struct thermal_governor *governor) { struct thermal_zone_device *pos; if (!governor) return; guard(mutex)(&thermal_governor_lock); if (!__find_governor(governor->name)) return; list_del(&governor->governor_list); guard(mutex)(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) { if (!strncasecmp(pos->governor->name, governor->name, THERMAL_NAME_LENGTH)) thermal_set_governor(pos, NULL); } } int thermal_zone_device_set_policy(struct thermal_zone_device *tz, char *policy) { struct thermal_governor *gov; int ret = -EINVAL; guard(mutex)(&thermal_governor_lock); guard(thermal_zone)(tz); gov = __find_governor(strim(policy)); if (gov) ret = thermal_set_governor(tz, gov); thermal_notify_tz_gov_change(tz, policy); return ret; } int thermal_build_list_of_policies(char *buf) { struct thermal_governor *pos; ssize_t count = 0; guard(mutex)(&thermal_governor_lock); list_for_each_entry(pos, &thermal_governor_list, governor_list) { count += sysfs_emit_at(buf, count, "%s ", pos->name); } count += sysfs_emit_at(buf, count, "\n"); return count; } static void __init thermal_unregister_governors(void) { struct thermal_governor **governor; for_each_governor_table(governor) thermal_unregister_governor(*governor); } static int __init thermal_register_governors(void) { int ret = 0; struct thermal_governor **governor; for_each_governor_table(governor) { ret = thermal_register_governor(*governor); if (ret) { pr_err("Failed to register governor: '%s'", (*governor)->name); break; } pr_info("Registered thermal governor '%s'", (*governor)->name); } if (ret) { struct thermal_governor **gov; for_each_governor_table(gov) { if (gov == governor) break; thermal_unregister_governor(*gov); } } return ret; } static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz, enum thermal_device_mode mode) { if (tz->ops.change_mode) { int ret; ret = tz->ops.change_mode(tz, mode); if (ret) return ret; } tz->mode = mode; return 0; } static void thermal_zone_broken_disable(struct thermal_zone_device *tz) { struct thermal_trip_desc *td; dev_err(&tz->device, "Unable to get temperature, disabling!\n"); /* * This function only runs for enabled thermal zones, so no need to * check for the current mode. */ __thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED); thermal_notify_tz_disable(tz); for_each_trip_desc(tz, td) { if (td->trip.type == THERMAL_TRIP_CRITICAL && td->trip.temperature > THERMAL_TEMP_INVALID) { dev_crit(&tz->device, "Disabled thermal zone with critical trip point\n"); return; } } } /* * Zone update section: main control loop applied to each zone while monitoring * in polling mode. The monitoring is done using a workqueue. * Same update may be done on a zone by calling thermal_zone_device_update(). * * An update means: * - Non-critical trips will invoke the governor responsible for that zone; * - Hot trips will produce a notification to userspace; * - Critical trip point will cause a system shutdown. */ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz, unsigned long delay) { if (delay > HZ) delay = round_jiffies_relative(delay); mod_delayed_work(system_freezable_power_efficient_wq, &tz->poll_queue, delay); } static void thermal_zone_recheck(struct thermal_zone_device *tz, int error) { if (error == -EAGAIN) { thermal_zone_device_set_polling(tz, THERMAL_RECHECK_DELAY); return; } /* * Print the message once to reduce log noise. It will be followed by * another one if the temperature cannot be determined after multiple * attempts. */ if (tz->recheck_delay_jiffies == THERMAL_RECHECK_DELAY) dev_info(&tz->device, "Temperature check failed (%d)\n", error); thermal_zone_device_set_polling(tz, tz->recheck_delay_jiffies); tz->recheck_delay_jiffies += max(tz->recheck_delay_jiffies >> 1, 1ULL); if (tz->recheck_delay_jiffies > THERMAL_MAX_RECHECK_DELAY) { thermal_zone_broken_disable(tz); /* * Restore the original recheck delay value to allow the thermal * zone to try to recover when it is reenabled by user space. */ tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; } } static void monitor_thermal_zone(struct thermal_zone_device *tz) { if (tz->passive > 0 && tz->passive_delay_jiffies) thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies); else if (tz->polling_delay_jiffies) thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies); } static struct thermal_governor *thermal_get_tz_governor(struct thermal_zone_device *tz) { if (tz->governor) return tz->governor; return def_governor; } void thermal_governor_update_tz(struct thermal_zone_device *tz, enum thermal_notify_event reason) { if (!tz->governor || !tz->governor->update_tz) return; tz->governor->update_tz(tz, reason); } static void thermal_zone_device_halt(struct thermal_zone_device *tz, bool shutdown) { /* * poweroff_delay_ms must be a carefully profiled positive value. * Its a must for forced_emergency_poweroff_work to be scheduled. */ int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS; const char *msg = "Temperature too high"; dev_emerg(&tz->device, "%s: critical temperature reached\n", tz->type); if (shutdown) hw_protection_shutdown(msg, poweroff_delay_ms); else hw_protection_reboot(msg, poweroff_delay_ms); } void thermal_zone_device_critical(struct thermal_zone_device *tz) { thermal_zone_device_halt(tz, true); } EXPORT_SYMBOL(thermal_zone_device_critical); void thermal_zone_device_critical_reboot(struct thermal_zone_device *tz) { thermal_zone_device_halt(tz, false); } static void handle_critical_trips(struct thermal_zone_device *tz, const struct thermal_trip *trip) { trace_thermal_zone_trip(tz, thermal_zone_trip_id(tz, trip), trip->type); if (trip->type == THERMAL_TRIP_CRITICAL) tz->ops.critical(tz); else if (tz->ops.hot) tz->ops.hot(tz); } static void move_trip_to_sorted_list(struct thermal_trip_desc *td, struct list_head *list) { struct thermal_trip_desc *entry; /* * Delete upfront and then add to make relocation within the same list * work. */ list_del(&td->list_node); /* Assume that the new entry is likely to be the last one. */ list_for_each_entry_reverse(entry, list, list_node) { if (entry->threshold <= td->threshold) { list_add(&td->list_node, &entry->list_node); return; } } list_add(&td->list_node, list); } static void move_to_trips_high(struct thermal_zone_device *tz, struct thermal_trip_desc *td) { td->threshold = td->trip.temperature; move_trip_to_sorted_list(td, &tz->trips_high); } static void move_to_trips_reached(struct thermal_zone_device *tz, struct thermal_trip_desc *td) { td->threshold = td->trip.temperature - td->trip.hysteresis; move_trip_to_sorted_list(td, &tz->trips_reached); } static void move_to_trips_invalid(struct thermal_zone_device *tz, struct thermal_trip_desc *td) { td->threshold = INT_MAX; list_move(&td->list_node, &tz->trips_invalid); } static void thermal_governor_trip_crossed(struct thermal_governor *governor, struct thermal_zone_device *tz, const struct thermal_trip *trip, bool upward) { if (trip->type == THERMAL_TRIP_HOT || trip->type == THERMAL_TRIP_CRITICAL) return; if (governor->trip_crossed) governor->trip_crossed(tz, trip, upward); } static void thermal_trip_crossed(struct thermal_zone_device *tz, struct thermal_trip_desc *td, struct thermal_governor *governor, bool upward) { const struct thermal_trip *trip = &td->trip; if (upward) { if (trip->type == THERMAL_TRIP_PASSIVE) tz->passive++; else if (trip->type == THERMAL_TRIP_CRITICAL || trip->type == THERMAL_TRIP_HOT) handle_critical_trips(tz, trip); thermal_notify_tz_trip_up(tz, trip); thermal_debug_tz_trip_up(tz, trip); } else { if (trip->type == THERMAL_TRIP_PASSIVE) { tz->passive--; WARN_ON(tz->passive < 0); } thermal_notify_tz_trip_down(tz, trip); thermal_debug_tz_trip_down(tz, trip); } thermal_governor_trip_crossed(governor, tz, trip, upward); } void thermal_zone_set_trip_hyst(struct thermal_zone_device *tz, struct thermal_trip *trip, int hyst) { struct thermal_trip_desc *td = trip_to_trip_desc(trip); WRITE_ONCE(trip->hysteresis, hyst); thermal_notify_tz_trip_change(tz, trip); /* * If the zone temperature is above or at the trip tmperature, the trip * is in the trips_reached list and its threshold is equal to its low * temperature. It needs to stay in that list, but its threshold needs * to be updated and the list ordering may need to be restored. */ if (tz->temperature >= td->threshold) move_to_trips_reached(tz, td); } void thermal_zone_set_trip_temp(struct thermal_zone_device *tz, struct thermal_trip *trip, int temp) { struct thermal_trip_desc *td = trip_to_trip_desc(trip); int old_temp = trip->temperature; if (old_temp == temp) return; WRITE_ONCE(trip->temperature, temp); thermal_notify_tz_trip_change(tz, trip); if (old_temp == THERMAL_TEMP_INVALID) { /* * The trip was invalid before the change, so move it to the * trips_high list regardless of the new temperature value * because there is no mitigation under way for it. If a * mitigation needs to be started, the trip will be moved to the * trips_reached list later. */ move_to_trips_high(tz, td); return; } if (temp == THERMAL_TEMP_INVALID) { /* * If the trip is in the trips_reached list, mitigation is under * way for it and it needs to be stopped because the trip is * effectively going away. */ if (tz->temperature >= td->threshold) thermal_trip_crossed(tz, td, thermal_get_tz_governor(tz), false); move_to_trips_invalid(tz, td); return; } /* * The trip stays on its current list, but its threshold needs to be * updated due to the temperature change and the list ordering may need * to be restored. */ if (tz->temperature >= td->threshold) move_to_trips_reached(tz, td); else move_to_trips_high(tz, td); } EXPORT_SYMBOL_GPL(thermal_zone_set_trip_temp); static void thermal_zone_handle_trips(struct thermal_zone_device *tz, struct thermal_governor *governor, int *low, int *high) { struct thermal_trip_desc *td, *next; LIST_HEAD(way_down_list); /* Check the trips that were below or at the zone temperature. */ list_for_each_entry_safe_reverse(td, next, &tz->trips_reached, list_node) { if (td->threshold <= tz->temperature) break; thermal_trip_crossed(tz, td, governor, false); /* * The current trips_high list needs to be processed before * adding new entries to it, so put them on a temporary list. */ list_move(&td->list_node, &way_down_list); } /* Check the trips that were previously above the zone temperature. */ list_for_each_entry_safe(td, next, &tz->trips_high, list_node) { if (td->threshold > tz->temperature) break; thermal_trip_crossed(tz, td, governor, true); move_to_trips_reached(tz, td); } /* Move all of the trips from the temporary list to trips_high. */ list_for_each_entry_safe(td, next, &way_down_list, list_node) move_to_trips_high(tz, td); if (!list_empty(&tz->trips_reached)) { td = list_last_entry(&tz->trips_reached, struct thermal_trip_desc, list_node); /* * Set the "low" value below the current trip threshold in case * the zone temperature is at that threshold and stays there, * which would trigger a new interrupt immediately in vain. */ *low = td->threshold - 1; } if (!list_empty(&tz->trips_high)) { td = list_first_entry(&tz->trips_high, struct thermal_trip_desc, list_node); *high = td->threshold; } } void __thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { struct thermal_governor *governor = thermal_get_tz_governor(tz); int low = -INT_MAX, high = INT_MAX; int temp, ret; if (tz->state != TZ_STATE_READY || tz->mode != THERMAL_DEVICE_ENABLED) return; ret = __thermal_zone_get_temp(tz, &temp); if (ret) { thermal_zone_recheck(tz, ret); return; } else if (temp <= THERMAL_TEMP_INVALID) { /* * Special case: No valid temperature value is available, but * the zone owner does not want the core to do anything about * it. Continue regular zone polling if needed, so that this * function can be called again, but skip everything else. */ goto monitor; } tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; tz->last_temperature = tz->temperature; tz->temperature = temp; trace_thermal_temperature(tz); thermal_genl_sampling_temp(tz->id, temp); tz->notify_event = event; thermal_zone_handle_trips(tz, governor, &low, &high); thermal_thresholds_handle(tz, &low, &high); thermal_zone_set_trips(tz, low, high); if (governor->manage) governor->manage(tz); thermal_debug_update_trip_stats(tz); monitor: monitor_thermal_zone(tz); } static int thermal_zone_device_set_mode(struct thermal_zone_device *tz, enum thermal_device_mode mode) { int ret; guard(thermal_zone)(tz); /* do nothing if mode isn't changing */ if (mode == tz->mode) return 0; ret = __thermal_zone_device_set_mode(tz, mode); if (ret) return ret; __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); if (mode == THERMAL_DEVICE_ENABLED) thermal_notify_tz_enable(tz); else thermal_notify_tz_disable(tz); return 0; } int thermal_zone_device_enable(struct thermal_zone_device *tz) { return thermal_zone_device_set_mode(tz, THERMAL_DEVICE_ENABLED); } EXPORT_SYMBOL_GPL(thermal_zone_device_enable); int thermal_zone_device_disable(struct thermal_zone_device *tz) { return thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED); } EXPORT_SYMBOL_GPL(thermal_zone_device_disable); static bool thermal_zone_is_present(struct thermal_zone_device *tz) { return !list_empty(&tz->node); } void thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { guard(thermal_zone)(tz); if (thermal_zone_is_present(tz)) __thermal_zone_device_update(tz, event); } EXPORT_SYMBOL_GPL(thermal_zone_device_update); int for_each_thermal_governor(int (*cb)(struct thermal_governor *, void *), void *data) { struct thermal_governor *gov; guard(mutex)(&thermal_governor_lock); list_for_each_entry(gov, &thermal_governor_list, governor_list) { int ret; ret = cb(gov, data); if (ret) return ret; } return 0; } int for_each_thermal_cooling_device(int (*cb)(struct thermal_cooling_device *, void *), void *data) { struct thermal_cooling_device *cdev; guard(mutex)(&thermal_list_lock); list_for_each_entry(cdev, &thermal_cdev_list, node) { int ret; ret = cb(cdev, data); if (ret) return ret; } return 0; } int for_each_thermal_zone(int (*cb)(struct thermal_zone_device *, void *), void *data) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { int ret; ret = cb(tz, data); if (ret) return ret; } return 0; } struct thermal_zone_device *thermal_zone_get_by_id(int id) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { if (tz->id == id) { get_device(&tz->device); return tz; } } return NULL; } /* * Device management section: cooling devices, zones devices, and binding * * Set of functions provided by the thermal core for: * - cooling devices lifecycle: registration, unregistration, * binding, and unbinding. * - thermal zone devices lifecycle: registration, unregistration, * binding, and unbinding. */ static int thermal_instance_add(struct thermal_instance *new_instance, struct thermal_cooling_device *cdev, struct thermal_trip_desc *td) { struct thermal_instance *instance; list_for_each_entry(instance, &td->thermal_instances, trip_node) { if (instance->cdev == cdev) return -EEXIST; } list_add_tail(&new_instance->trip_node, &td->thermal_instances); guard(cooling_dev)(cdev); list_add_tail(&new_instance->cdev_node, &cdev->thermal_instances); return 0; } /** * thermal_bind_cdev_to_trip - bind a cooling device to a thermal zone * @tz: pointer to struct thermal_zone_device * @td: descriptor of the trip point to bind @cdev to * @cdev: pointer to struct thermal_cooling_device * @cool_spec: cooling specification for the trip point and @cdev * * This interface function bind a thermal cooling device to the certain trip * point of a thermal zone device. * This function is usually called in the thermal zone device .bind callback. * * Return: 0 on success, the proper error value otherwise. */ static int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz, struct thermal_trip_desc *td, struct thermal_cooling_device *cdev, struct cooling_spec *cool_spec) { struct thermal_instance *dev; bool upper_no_limit; int result; /* lower default 0, upper default max_state */ if (cool_spec->lower == THERMAL_NO_LIMIT) cool_spec->lower = 0; if (cool_spec->upper == THERMAL_NO_LIMIT) { cool_spec->upper = cdev->max_state; upper_no_limit = true; } else { upper_no_limit = false; } if (cool_spec->lower > cool_spec->upper || cool_spec->upper > cdev->max_state) return -EINVAL; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; dev->cdev = cdev; dev->trip = &td->trip; dev->upper = cool_spec->upper; dev->upper_no_limit = upper_no_limit; dev->lower = cool_spec->lower; dev->target = THERMAL_NO_TARGET; dev->weight = cool_spec->weight; result = ida_alloc(&tz->ida, GFP_KERNEL); if (result < 0) goto free_mem; dev->id = result; sprintf(dev->name, "cdev%d", dev->id); result = sysfs_create_link(&tz->device.kobj, &cdev->device.kobj, dev->name); if (result) goto release_ida; snprintf(dev->attr_name, sizeof(dev->attr_name), "cdev%d_trip_point", dev->id); sysfs_attr_init(&dev->attr.attr); dev->attr.attr.name = dev->attr_name; dev->attr.attr.mode = 0444; dev->attr.show = trip_point_show; result = device_create_file(&tz->device, &dev->attr); if (result) goto remove_symbol_link; snprintf(dev->weight_attr_name, sizeof(dev->weight_attr_name), "cdev%d_weight", dev->id); sysfs_attr_init(&dev->weight_attr.attr); dev->weight_attr.attr.name = dev->weight_attr_name; dev->weight_attr.attr.mode = S_IWUSR | S_IRUGO; dev->weight_attr.show = weight_show; dev->weight_attr.store = weight_store; result = device_create_file(&tz->device, &dev->weight_attr); if (result) goto remove_trip_file; result = thermal_instance_add(dev, cdev, td); if (result) goto remove_weight_file; thermal_governor_update_tz(tz, THERMAL_TZ_BIND_CDEV); return 0; remove_weight_file: device_remove_file(&tz->device, &dev->weight_attr); remove_trip_file: device_remove_file(&tz->device, &dev->attr); remove_symbol_link: sysfs_remove_link(&tz->device.kobj, dev->name); release_ida: ida_free(&tz->ida, dev->id); free_mem: kfree(dev); return result; } static void thermal_instance_delete(struct thermal_instance *instance) { list_del(&instance->trip_node); guard(cooling_dev)(instance->cdev); list_del(&instance->cdev_node); } /** * thermal_unbind_cdev_from_trip - unbind a cooling device from a thermal zone. * @tz: pointer to a struct thermal_zone_device. * @td: descriptor of the trip point to unbind @cdev from * @cdev: pointer to a struct thermal_cooling_device. * * This interface function unbind a thermal cooling device from the certain * trip point of a thermal zone device. * This function is usually called in the thermal zone device .unbind callback. */ static void thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz, struct thermal_trip_desc *td, struct thermal_cooling_device *cdev) { struct thermal_instance *pos, *next; list_for_each_entry_safe(pos, next, &td->thermal_instances, trip_node) { if (pos->cdev == cdev) { thermal_instance_delete(pos); goto unbind; } } return; unbind: thermal_governor_update_tz(tz, THERMAL_TZ_UNBIND_CDEV); device_remove_file(&tz->device, &pos->weight_attr); device_remove_file(&tz->device, &pos->attr); sysfs_remove_link(&tz->device.kobj, pos->name); ida_free(&tz->ida, pos->id); kfree(pos); } static void thermal_release(struct device *dev) { struct thermal_zone_device *tz; struct thermal_cooling_device *cdev; if (!strncmp(dev_name(dev), "thermal_zone", sizeof("thermal_zone") - 1)) { tz = to_thermal_zone(dev); thermal_zone_destroy_device_groups(tz); mutex_destroy(&tz->lock); complete(&tz->removal); } else if (!strncmp(dev_name(dev), "cooling_device", sizeof("cooling_device") - 1)) { cdev = to_cooling_device(dev); thermal_cooling_device_destroy_sysfs(cdev); kfree_const(cdev->type); ida_free(&thermal_cdev_ida, cdev->id); kfree(cdev); } } static struct class *thermal_class; static inline void print_bind_err_msg(struct thermal_zone_device *tz, const struct thermal_trip_desc *td, struct thermal_cooling_device *cdev, int ret) { dev_err(&tz->device, "binding cdev %s to trip %d failed: %d\n", cdev->type, thermal_zone_trip_id(tz, &td->trip), ret); } static bool __thermal_zone_cdev_bind(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev) { struct thermal_trip_desc *td; bool update_tz = false; if (!tz->ops.should_bind) return false; for_each_trip_desc(tz, td) { struct cooling_spec c = { .upper = THERMAL_NO_LIMIT, .lower = THERMAL_NO_LIMIT, .weight = THERMAL_WEIGHT_DEFAULT }; int ret; if (!tz->ops.should_bind(tz, &td->trip, cdev, &c)) continue; ret = thermal_bind_cdev_to_trip(tz, td, cdev, &c); if (ret) { print_bind_err_msg(tz, td, cdev, ret); continue; } update_tz = true; } return update_tz; } static void thermal_zone_cdev_bind(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev) { guard(thermal_zone)(tz); if (__thermal_zone_cdev_bind(tz, cdev)) __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); } static void thermal_cooling_device_init_complete(struct thermal_cooling_device *cdev) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); list_add(&cdev->node, &thermal_cdev_list); list_for_each_entry(tz, &thermal_tz_list, node) thermal_zone_cdev_bind(tz, cdev); } /** * __thermal_cooling_device_register() - register a new thermal cooling device * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * It also gives the opportunity to link the cooling device to a device tree * node, so that it can be bound to a thermal zone created out of device tree. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ static struct thermal_cooling_device * __thermal_cooling_device_register(struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device *cdev; unsigned long current_state; int id, ret; if (!ops || !ops->get_max_state || !ops->get_cur_state || !ops->set_cur_state) return ERR_PTR(-EINVAL); if (!thermal_class) return ERR_PTR(-ENODEV); cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) return ERR_PTR(-ENOMEM); ret = ida_alloc(&thermal_cdev_ida, GFP_KERNEL); if (ret < 0) goto out_kfree_cdev; cdev->id = ret; id = ret; cdev->type = kstrdup_const(type ? type : "", GFP_KERNEL); if (!cdev->type) { ret = -ENOMEM; goto out_ida_remove; } mutex_init(&cdev->lock); INIT_LIST_HEAD(&cdev->thermal_instances); cdev->np = np; cdev->ops = ops; cdev->updated = false; cdev->device.class = thermal_class; cdev->devdata = devdata; ret = cdev->ops->get_max_state(cdev, &cdev->max_state); if (ret) goto out_cdev_type; /* * The cooling device's current state is only needed for debug * initialization below, so a failure to get it does not cause * the entire cooling device initialization to fail. However, * the debug will not work for the device if its initial state * cannot be determined and drivers are responsible for ensuring * that this will not happen. */ ret = cdev->ops->get_cur_state(cdev, &current_state); if (ret) current_state = ULONG_MAX; thermal_cooling_device_setup_sysfs(cdev); ret = dev_set_name(&cdev->device, "cooling_device%d", cdev->id); if (ret) goto out_cooling_dev; ret = device_register(&cdev->device); if (ret) { /* thermal_release() handles rest of the cleanup */ put_device(&cdev->device); return ERR_PTR(ret); } if (current_state <= cdev->max_state) thermal_debug_cdev_add(cdev, current_state); thermal_cooling_device_init_complete(cdev); return cdev; out_cooling_dev: thermal_cooling_device_destroy_sysfs(cdev); out_cdev_type: kfree_const(cdev->type); out_ida_remove: ida_free(&thermal_cdev_ida, id); out_kfree_cdev: kfree(cdev); return ERR_PTR(ret); } /** * thermal_cooling_device_register() - register a new thermal cooling device * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return __thermal_cooling_device_register(NULL, type, devdata, ops); } EXPORT_SYMBOL_GPL(thermal_cooling_device_register); /** * thermal_of_cooling_device_register() - register an OF thermal cooling device * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This function will register a cooling device with device tree node reference. * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * thermal_of_cooling_device_register(struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return __thermal_cooling_device_register(np, type, devdata, ops); } EXPORT_SYMBOL_GPL(thermal_of_cooling_device_register); static void thermal_cooling_device_release(struct device *dev, void *res) { thermal_cooling_device_unregister( *(struct thermal_cooling_device **)res); } /** * devm_thermal_of_cooling_device_register() - register an OF thermal cooling * device * @dev: a valid struct device pointer of a sensor device. * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This function will register a cooling device with device tree node reference. * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * devm_thermal_of_cooling_device_register(struct device *dev, struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device **ptr, *tcd; ptr = devres_alloc(thermal_cooling_device_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); tcd = __thermal_cooling_device_register(np, type, devdata, ops); if (IS_ERR(tcd)) { devres_free(ptr); return tcd; } *ptr = tcd; devres_add(dev, ptr); return tcd; } EXPORT_SYMBOL_GPL(devm_thermal_of_cooling_device_register); static bool thermal_cooling_device_present(struct thermal_cooling_device *cdev) { struct thermal_cooling_device *pos = NULL; list_for_each_entry(pos, &thermal_cdev_list, node) { if (pos == cdev) return true; } return false; } /** * thermal_cooling_device_update - Update a cooling device object * @cdev: Target cooling device. * * Update @cdev to reflect a change of the underlying hardware or platform. * * Must be called when the maximum cooling state of @cdev becomes invalid and so * its .get_max_state() callback needs to be run to produce the new maximum * cooling state value. */ void thermal_cooling_device_update(struct thermal_cooling_device *cdev) { struct thermal_instance *ti; unsigned long state; if (IS_ERR_OR_NULL(cdev)) return; /* * Hold thermal_list_lock throughout the update to prevent the device * from going away while being updated. */ guard(mutex)(&thermal_list_lock); if (!thermal_cooling_device_present(cdev)) return; /* * Update under the cdev lock to prevent the state from being set beyond * the new limit concurrently. */ guard(cooling_dev)(cdev); if (cdev->ops->get_max_state(cdev, &cdev->max_state)) return; thermal_cooling_device_stats_reinit(cdev); list_for_each_entry(ti, &cdev->thermal_instances, cdev_node) { if (ti->upper == cdev->max_state) continue; if (ti->upper < cdev->max_state) { if (ti->upper_no_limit) ti->upper = cdev->max_state; continue; } ti->upper = cdev->max_state; if (ti->lower > ti->upper) ti->lower = ti->upper; if (ti->target == THERMAL_NO_TARGET) continue; if (ti->target > ti->upper) ti->target = ti->upper; } if (cdev->ops->get_cur_state(cdev, &state) || state > cdev->max_state) return; thermal_cooling_device_stats_update(cdev, state); } EXPORT_SYMBOL_GPL(thermal_cooling_device_update); static void __thermal_zone_cdev_unbind(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev) { struct thermal_trip_desc *td; for_each_trip_desc(tz, td) thermal_unbind_cdev_from_trip(tz, td, cdev); } static void thermal_zone_cdev_unbind(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev) { guard(thermal_zone)(tz); __thermal_zone_cdev_unbind(tz, cdev); } static bool thermal_cooling_device_exit(struct thermal_cooling_device *cdev) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); if (!thermal_cooling_device_present(cdev)) return false; list_del(&cdev->node); list_for_each_entry(tz, &thermal_tz_list, node) thermal_zone_cdev_unbind(tz, cdev); return true; } /** * thermal_cooling_device_unregister() - removes a thermal cooling device * @cdev: Thermal cooling device to remove. */ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev) { if (!cdev) return; thermal_debug_cdev_remove(cdev); if (thermal_cooling_device_exit(cdev)) device_unregister(&cdev->device); } EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister); int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp) { const struct thermal_trip_desc *td; int ret = -EINVAL; if (tz->ops.get_crit_temp) return tz->ops.get_crit_temp(tz, temp); guard(thermal_zone)(tz); for_each_trip_desc(tz, td) { const struct thermal_trip *trip = &td->trip; if (trip->type == THERMAL_TRIP_CRITICAL) { *temp = trip->temperature; ret = 0; break; } } return ret; } EXPORT_SYMBOL_GPL(thermal_zone_get_crit_temp); static void thermal_zone_device_check(struct work_struct *work) { struct thermal_zone_device *tz = container_of(work, struct thermal_zone_device, poll_queue.work); thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); } static void thermal_zone_device_init(struct thermal_zone_device *tz) { struct thermal_trip_desc *td, *next; INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_check); tz->temperature = THERMAL_TEMP_INIT; tz->passive = 0; tz->prev_low_trip = -INT_MAX; tz->prev_high_trip = INT_MAX; for_each_trip_desc(tz, td) { struct thermal_instance *instance; list_for_each_entry(instance, &td->thermal_instances, trip_node) instance->initialized = false; } /* * At this point, all valid trips need to be moved to trips_high so that * mitigation can be started if the zone temperature is above them. */ list_for_each_entry_safe(td, next, &tz->trips_invalid, list_node) { if (td->trip.temperature != THERMAL_TEMP_INVALID) move_to_trips_high(tz, td); } /* The trips_reached list may not be empty during system resume. */ list_for_each_entry_safe(td, next, &tz->trips_reached, list_node) { if (td->trip.temperature == THERMAL_TEMP_INVALID) move_to_trips_invalid(tz, td); else move_to_trips_high(tz, td); } } static int thermal_zone_init_governor(struct thermal_zone_device *tz) { struct thermal_governor *governor; guard(mutex)(&thermal_governor_lock); if (tz->tzp) governor = __find_governor(tz->tzp->governor_name); else governor = def_governor; return thermal_set_governor(tz, governor); } static void thermal_zone_init_complete(struct thermal_zone_device *tz) { struct thermal_cooling_device *cdev; guard(mutex)(&thermal_list_lock); list_add_tail(&tz->node, &thermal_tz_list); guard(thermal_zone)(tz); /* Bind cooling devices for this zone. */ list_for_each_entry(cdev, &thermal_cdev_list, node) __thermal_zone_cdev_bind(tz, cdev); tz->state &= ~TZ_STATE_FLAG_INIT; /* * If system suspend or resume is in progress at this point, the * new thermal zone needs to be marked as suspended because * thermal_pm_notify() has run already. */ if (thermal_pm_suspended) tz->state |= TZ_STATE_FLAG_SUSPENDED; __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); } /** * thermal_zone_device_register_with_trips() - register a new thermal zone device * @type: the thermal zone device type * @trips: a pointer to an array of thermal trips * @num_trips: the number of trip points the thermal zone support * @devdata: private device data * @ops: standard thermal zone device callbacks * @tzp: thermal zone platform parameters * @passive_delay: number of milliseconds to wait between polls when * performing passive cooling * @polling_delay: number of milliseconds to wait between polls when checking * whether trip points have been crossed (0 for interrupt * driven systems) * * This interface function adds a new thermal zone device (sensor) to * /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the * thermal cooling devices registered at the same time. * thermal_zone_device_unregister() must be called when the device is no * longer needed. The passive cooling depends on the .get_trend() return value. * * Return: a pointer to the created struct thermal_zone_device or an * in case of error, an ERR_PTR. Caller must check return value with * IS_ERR*() helpers. */ struct thermal_zone_device * thermal_zone_device_register_with_trips(const char *type, const struct thermal_trip *trips, int num_trips, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp, unsigned int passive_delay, unsigned int polling_delay) { const struct thermal_trip *trip = trips; struct thermal_zone_device *tz; struct thermal_trip_desc *td; int id; int result; if (!type || strlen(type) == 0) { pr_err("No thermal zone type defined\n"); return ERR_PTR(-EINVAL); } if (strlen(type) >= THERMAL_NAME_LENGTH) { pr_err("Thermal zone name (%s) too long, should be under %d chars\n", type, THERMAL_NAME_LENGTH); return ERR_PTR(-EINVAL); } if (num_trips < 0) { pr_err("Incorrect number of thermal trips\n"); return ERR_PTR(-EINVAL); } if (!ops || !ops->get_temp) { pr_err("Thermal zone device ops not defined or invalid\n"); return ERR_PTR(-EINVAL); } if (num_trips > 0 && !trips) return ERR_PTR(-EINVAL); if (polling_delay && passive_delay > polling_delay) return ERR_PTR(-EINVAL); if (!thermal_class) return ERR_PTR(-ENODEV); tz = kzalloc(struct_size(tz, trips, num_trips), GFP_KERNEL); if (!tz) return ERR_PTR(-ENOMEM); if (tzp) { tz->tzp = kmemdup(tzp, sizeof(*tzp), GFP_KERNEL); if (!tz->tzp) { result = -ENOMEM; goto free_tz; } } INIT_LIST_HEAD(&tz->node); INIT_LIST_HEAD(&tz->trips_high); INIT_LIST_HEAD(&tz->trips_reached); INIT_LIST_HEAD(&tz->trips_invalid); ida_init(&tz->ida); mutex_init(&tz->lock); init_completion(&tz->removal); init_completion(&tz->resume); id = ida_alloc(&thermal_tz_ida, GFP_KERNEL); if (id < 0) { result = id; goto free_tzp; } tz->id = id; strscpy(tz->type, type, sizeof(tz->type)); tz->ops = *ops; if (!tz->ops.critical) tz->ops.critical = thermal_zone_device_critical; tz->device.class = thermal_class; tz->devdata = devdata; tz->num_trips = num_trips; for_each_trip_desc(tz, td) { td->trip = *trip++; INIT_LIST_HEAD(&td->thermal_instances); INIT_LIST_HEAD(&td->list_node); /* * Mark all thresholds as invalid to start with even though * this only matters for the trips that start as invalid and * become valid later. */ move_to_trips_invalid(tz, td); } tz->polling_delay_jiffies = msecs_to_jiffies(polling_delay); tz->passive_delay_jiffies = msecs_to_jiffies(passive_delay); tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; tz->state = TZ_STATE_FLAG_INIT; /* sys I/F */ /* Add nodes that are always present via .groups */ result = thermal_zone_create_device_groups(tz); if (result) goto remove_id; result = dev_set_name(&tz->device, "thermal_zone%d", tz->id); if (result) { thermal_zone_destroy_device_groups(tz); goto remove_id; } thermal_zone_device_init(tz); result = device_register(&tz->device); if (result) goto release_device; result = thermal_zone_init_governor(tz); if (result) goto unregister; if (!tz->tzp || !tz->tzp->no_hwmon) { result = thermal_add_hwmon_sysfs(tz); if (result) goto unregister; } result = thermal_thresholds_init(tz); if (result) goto remove_hwmon; thermal_zone_init_complete(tz); thermal_notify_tz_create(tz); thermal_debug_tz_add(tz); return tz; remove_hwmon: thermal_remove_hwmon_sysfs(tz); unregister: device_del(&tz->device); release_device: put_device(&tz->device); remove_id: ida_free(&thermal_tz_ida, id); free_tzp: kfree(tz->tzp); free_tz: kfree(tz); return ERR_PTR(result); } EXPORT_SYMBOL_GPL(thermal_zone_device_register_with_trips); struct thermal_zone_device *thermal_tripless_zone_device_register( const char *type, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp) { return thermal_zone_device_register_with_trips(type, NULL, 0, devdata, ops, tzp, 0, 0); } EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register); void *thermal_zone_device_priv(struct thermal_zone_device *tzd) { return tzd->devdata; } EXPORT_SYMBOL_GPL(thermal_zone_device_priv); const char *thermal_zone_device_type(struct thermal_zone_device *tzd) { return tzd->type; } EXPORT_SYMBOL_GPL(thermal_zone_device_type); int thermal_zone_device_id(struct thermal_zone_device *tzd) { return tzd->id; } EXPORT_SYMBOL_GPL(thermal_zone_device_id); struct device *thermal_zone_device(struct thermal_zone_device *tzd) { return &tzd->device; } EXPORT_SYMBOL_GPL(thermal_zone_device); static bool thermal_zone_exit(struct thermal_zone_device *tz) { struct thermal_cooling_device *cdev; guard(mutex)(&thermal_list_lock); if (list_empty(&tz->node)) return false; guard(thermal_zone)(tz); tz->state |= TZ_STATE_FLAG_EXIT; list_del_init(&tz->node); /* Unbind all cdevs associated with this thermal zone. */ list_for_each_entry(cdev, &thermal_cdev_list, node) __thermal_zone_cdev_unbind(tz, cdev); return true; } /** * thermal_zone_device_unregister - removes the registered thermal zone device * @tz: the thermal zone device to remove */ void thermal_zone_device_unregister(struct thermal_zone_device *tz) { if (!tz) return; thermal_debug_tz_remove(tz); if (!thermal_zone_exit(tz)) return; cancel_delayed_work_sync(&tz->poll_queue); thermal_set_governor(tz, NULL); thermal_thresholds_exit(tz); thermal_remove_hwmon_sysfs(tz); ida_free(&thermal_tz_ida, tz->id); ida_destroy(&tz->ida); device_del(&tz->device); put_device(&tz->device); thermal_notify_tz_delete(tz); wait_for_completion(&tz->removal); kfree(tz->tzp); kfree(tz); } EXPORT_SYMBOL_GPL(thermal_zone_device_unregister); /** * thermal_zone_get_zone_by_name() - search for a zone and returns its ref * @name: thermal zone name to fetch the temperature * * When only one zone is found with the passed name, returns a reference to it. * * Return: On success returns a reference to an unique thermal zone with * matching name equals to @name, an ERR_PTR otherwise (-EINVAL for invalid * paramenters, -ENODEV for not found and -EEXIST for multiple matches). */ struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name) { struct thermal_zone_device *pos = NULL, *ref = ERR_PTR(-EINVAL); unsigned int found = 0; if (!name) return ERR_PTR(-EINVAL); guard(mutex)(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) if (!strncasecmp(name, pos->type, THERMAL_NAME_LENGTH)) { found++; ref = pos; } if (!found) return ERR_PTR(-ENODEV); /* Success only when one zone is found. */ if (found > 1) return ERR_PTR(-EEXIST); return ref; } EXPORT_SYMBOL_GPL(thermal_zone_get_zone_by_name); static void thermal_zone_device_resume(struct work_struct *work) { struct thermal_zone_device *tz; tz = container_of(work, struct thermal_zone_device, poll_queue.work); guard(thermal_zone)(tz); tz->state &= ~(TZ_STATE_FLAG_SUSPENDED | TZ_STATE_FLAG_RESUMING); thermal_debug_tz_resume(tz); thermal_zone_device_init(tz); thermal_governor_update_tz(tz, THERMAL_TZ_RESUME); __thermal_zone_device_update(tz, THERMAL_TZ_RESUME); complete(&tz->resume); } static void thermal_zone_pm_prepare(struct thermal_zone_device *tz) { guard(thermal_zone)(tz); if (tz->state & TZ_STATE_FLAG_RESUMING) { /* * thermal_zone_device_resume() queued up for this zone has not * acquired the lock yet, so release it to let the function run * and wait util it has done the work. */ scoped_guard(thermal_zone_reverse, tz) { wait_for_completion(&tz->resume); } } tz->state |= TZ_STATE_FLAG_SUSPENDED; } static void thermal_pm_notify_prepare(void) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); thermal_pm_suspended = true; list_for_each_entry(tz, &thermal_tz_list, node) thermal_zone_pm_prepare(tz); } static void thermal_zone_pm_complete(struct thermal_zone_device *tz) { guard(thermal_zone)(tz); cancel_delayed_work(&tz->poll_queue); reinit_completion(&tz->resume); tz->state |= TZ_STATE_FLAG_RESUMING; /* * Replace the work function with the resume one, which will restore the * original work function and schedule the polling work if needed. */ INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_resume); /* Queue up the work without a delay. */ mod_delayed_work(system_freezable_power_efficient_wq, &tz->poll_queue, 0); } static void thermal_pm_notify_complete(void) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); thermal_pm_suspended = false; list_for_each_entry(tz, &thermal_tz_list, node) thermal_zone_pm_complete(tz); } static int thermal_pm_notify(struct notifier_block *nb, unsigned long mode, void *_unused) { switch (mode) { case PM_HIBERNATION_PREPARE: case PM_RESTORE_PREPARE: case PM_SUSPEND_PREPARE: thermal_pm_notify_prepare(); break; case PM_POST_HIBERNATION: case PM_POST_RESTORE: case PM_POST_SUSPEND: thermal_pm_notify_complete(); break; default: break; } return 0; } static struct notifier_block thermal_pm_nb = { .notifier_call = thermal_pm_notify, /* * Run at the lowest priority to avoid interference between the thermal * zone resume work items spawned by thermal_pm_notify() and the other * PM notifiers. */ .priority = INT_MIN, }; static int __init thermal_init(void) { int result; thermal_debug_init(); result = thermal_netlink_init(); if (result) goto error; result = thermal_register_governors(); if (result) goto unregister_netlink; thermal_class = kzalloc(sizeof(*thermal_class), GFP_KERNEL); if (!thermal_class) { result = -ENOMEM; goto unregister_governors; } thermal_class->name = "thermal"; thermal_class->dev_release = thermal_release; result = class_register(thermal_class); if (result) { kfree(thermal_class); thermal_class = NULL; goto unregister_governors; } result = register_pm_notifier(&thermal_pm_nb); if (result) pr_warn("Thermal: Can not register suspend notifier, return %d\n", result); return 0; unregister_governors: thermal_unregister_governors(); unregister_netlink: thermal_netlink_exit(); error: mutex_destroy(&thermal_list_lock); mutex_destroy(&thermal_governor_lock); return result; } postcore_initcall(thermal_init);
16 16 16 1 16 16 16 16 8 8 8 3 3 3 1 4 1 5 16 16 16 2 7 5 6 1 1 1 11 11 11 9 2 1 5 3 1 2 22 22 13 7 2 20 1 22 22 24 24 10 13 22 22 5 2 1 1 1 3 2 14 15 14 12 3 1 2 5 4 5 4 17 10 1 1 2 3 1 8 6 2 1 6 3 1 1 1 1 17 13 14 14 14 5 5 1 2 2 3 1 3 1 3 1 3 1 3 1 5 3 2 1 1 39 79 80 5 2 2 7 7 7 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 virtual tunneling interface * * Copyright (C) 2013 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> * * Based on: * net/ipv6/ip6_tunnel.c */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/etherdevice.h> #define IP6_VTI_HASH_SIZE_SHIFT 5 #define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT) static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_VTI_HASH_SIZE_SHIFT); } static int vti6_dev_init(struct net_device *dev); static void vti6_dev_setup(struct net_device *dev); static struct rtnl_link_ops vti6_link_ops __read_mostly; static unsigned int vti6_net_id __read_mostly; struct vti6_net { /* the vti6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_VTI_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; }; #define for_each_vti6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * vti6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * vti6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t; struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct in6_addr any; for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && (t->dev->flags & IFF_UP)) return t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && (t->dev->flags & IFF_UP)) return t; } hash = HASH(remote, &any); for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(remote, &t->parms.raddr) && (t->dev->flags & IFF_UP)) return t; } t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * vti6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * vti6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** vti6_tnl_bucket(struct vti6_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } static void vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms); rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } static void vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; for (tp = vti6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static int vti6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); int err; dev->rtnl_link_ops = &vti6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); vti6_tnl_link(ip6n, t); return 0; out: return err; } static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6_vti%%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = vti6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return NULL; } /** * vti6_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * vti6_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or NULL **/ static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct vti6_net *ip6n = net_generic(net, vti6_net_id); for (tp = vti6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr)) { if (create) return NULL; return t; } } if (!create) return NULL; return vti6_tnl_create(net, p); } /** * vti6_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * vti6_dev_uninit() removes tunnel from its list **/ static void vti6_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct vti6_net *ip6n = net_generic(t->net, vti6_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else vti6_tnl_unlink(ip6n, t); netdev_put(dev, &t->dev_tracker); } static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); rcu_read_lock(); t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); if (t) { if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) { rcu_read_unlock(); goto discard; } if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); goto discard; } ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { DEV_STATS_INC(t->dev, rx_dropped); rcu_read_unlock(); goto discard; } rcu_read_unlock(); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, encap_type); } rcu_read_unlock(); return -EINVAL; discard: kfree_skb(skb); return 0; } static int vti6_rcv(struct sk_buff *skb) { int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff]; return vti6_input_proto(skb, nexthdr, 0, 0); } static int vti6_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; struct net_device *dev; struct xfrm_state *x; const struct xfrm_mode *inner_mode; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; u32 orig_mark = skb->mark; int ret; if (!t) return 1; dev = t->dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } x = xfrm_input_state(skb); inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } family = inner_mode->family; skb->mark = be32_to_cpu(t->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); skb->mark = orig_mark; if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev))); skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); return 0; } /** * vti6_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool vti6_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } static bool vti6_state_check(const struct xfrm_state *x, const struct in6_addr *dst, const struct in6_addr *src) { xfrm_address_t *daddr = (xfrm_address_t *)dst; xfrm_address_t *saddr = (xfrm_address_t *)src; /* if there is no transform then this tunnel is not functional. * Or if the xfrm is not mode tunnel. */ if (!x || x->props.mode != XFRM_MODE_TUNNEL || x->props.family != AF_INET6) return false; if (ipv6_addr_any(dst)) return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET6); if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET6)) return false; return true; } /** * vti6_xmit - send a packet * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @fl: the flow informations for the xfrm_lookup **/ static int vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; int pkt_len = skb->len; int err = -1; int mtu; if (!dst) { switch (skb->protocol) { case htons(ETH_P_IP): { struct rtable *rt; fl->u.ip4.flowi4_oif = dev->ifindex; fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); if (IS_ERR(rt)) goto tx_err_link_failure; dst = &rt->dst; skb_dst_set(skb, dst); break; } case htons(ETH_P_IPV6): fl->u.ip6.flowi6_oif = dev->ifindex; fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); if (dst->error) { dst_release(dst); dst = NULL; goto tx_err_link_failure; } skb_dst_set(skb, dst); break; default: goto tx_err_link_failure; } } dst_hold(dst); dst = xfrm_lookup_route(t->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (dst->flags & DST_XFRM_QUEUE) goto xmit; x = dst->xfrm; if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr)) goto tx_err_link_failure; if (!ip6_tnl_xmit_ctl(t, (const struct in6_addr *)&x->props.saddr, (const struct in6_addr *)&x->id.daddr)) goto tx_err_link_failure; tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } err = -EMSGSIZE; goto tx_err_dst_release; } xmit: skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = pkt_len; iptunnel_xmit_stats(dev, err); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct flowi fl; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); break; default: goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(t->parms.o_key); ret = vti6_xmit(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __be32 spi; __u32 mark; struct xfrm_state *x; struct ip6_tnl *t; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct ip_comp_hdr *ipch; struct net *net = dev_net(skb->dev); const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; int protocol = iph->nexthdr; t = vti6_tnl_lookup(dev_net(skb->dev), &iph->daddr, &iph->saddr); if (!t) return -1; mark = be32_to_cpu(t->parms.o_key); switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu) { struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; struct net_device *tdev = NULL; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV | IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags & IP6_TNL_F_CAP_XMIT && p->flags & IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; if (keep_mtu && dev->mtu) { WRITE_ONCE(dev->mtu, clamp(dev->mtu, dev->min_mtu, dev->max_mtu)); return; } if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) mtu = tdev->mtu - sizeof(struct ipv6hdr); else mtu = ETH_DATA_LEN - LL_MAX_HEADER - sizeof(struct ipv6hdr); dev->mtu = max_t(int, mtu, IPV4_MIN_MTU); } /** * vti6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * @keep_mtu: MTU was set from userspace, don't re-compute it * * Description: * vti6_tnl_change() updates the tunnel parameters **/ static int vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, bool keep_mtu) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.link = p->link; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); vti6_link_config(t, keep_mtu); return 0; } static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, bool keep_mtu) { struct net *net = dev_net(t->dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); int err; vti6_tnl_unlink(ip6n, t); synchronize_net(); err = vti6_tnl_change(t, p, keep_mtu); vti6_tnl_link(ip6n, t); netdev_state_change(t->dev); return err; } static void vti6_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm2 *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->link = p->link; u->i_key = p->i_key; u->o_key = p->o_key; if (u->i_key) u->i_flags |= GRE_KEY; if (u->o_key) u->o_flags |= GRE_KEY; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * vti6_siocdevprivate - configure vti6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * vti6_siocdevprivate() is used for managing vti6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6_vti0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm2 p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = NULL; struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, 0); } else { memset(&p, 0, sizeof(p)); } if (!t) t = netdev_priv(dev); vti6_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != 0) break; vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL); if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { if (t) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); err = vti6_update(t, &p1, false); } if (t) { err = 0; vti6_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, 0); if (!t) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } static const struct net_device_ops vti6_netdev_ops = { .ndo_init = vti6_dev_init, .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, .ndo_siocdevprivate = vti6_siocdevprivate, .ndo_get_iflink = ip6_tnl_get_iflink, }; /** * vti6_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void vti6_dev_setup(struct net_device *dev) { dev->netdev_ops = &vti6_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_TUNNEL6; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr); dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * vti6_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int vti6_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); t->dev = dev; t->net = dev_net(dev); netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; } /** * vti6_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int vti6_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = vti6_dev_init_gen(dev); if (err) return err; vti6_link_config(t, true); return 0; } /** * vti6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int vti6_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void vti6_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); if (data[IFLA_VTI_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_VTI_LOCAL]); if (data[IFLA_VTI_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_VTI_REMOTE]); if (data[IFLA_VTI_IKEY]) parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); if (data[IFLA_VTI_OKEY]) parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); if (data[IFLA_VTI_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti6_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct ip6_tnl *nt; nt = netdev_priv(dev); vti6_netlink_parms(data, &nt->parms); nt->parms.proto = IPPROTO_IPV6; if (vti6_locate(net, &nt->parms, 0)) return -EEXIST; return vti6_tnl_create2(dev); } static void vti6_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static int vti6_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t; struct __ip6_tnl_parm p; struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); if (dev == ip6n->fb_tnl_dev) return -EINVAL; vti6_netlink_parms(data, &p); t = vti6_locate(net, &p, 0); if (t) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); return vti6_update(t, &p, tb && tb[IFLA_MTU]); } static size_t vti6_get_size(const struct net_device *dev) { return /* IFLA_VTI_LINK */ nla_total_size(4) + /* IFLA_VTI_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VTI_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VTI_IKEY */ nla_total_size(4) + /* IFLA_VTI_OKEY */ nla_total_size(4) + /* IFLA_VTI_FWMARK */ nla_total_size(4) + 0; } static int vti6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_VTI_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_VTI_REMOTE, &parm->raddr) || nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) || nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key) || nla_put_u32(skb, IFLA_VTI_FWMARK, parm->fwmark)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy vti6_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_LINK] = { .type = NLA_U32 }, [IFLA_VTI_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_VTI_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_VTI_IKEY] = { .type = NLA_U32 }, [IFLA_VTI_OKEY] = { .type = NLA_U32 }, [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti6_link_ops __read_mostly = { .kind = "vti6", .maxtype = IFLA_VTI_MAX, .policy = vti6_policy, .priv_size = sizeof(struct ip6_tnl), .setup = vti6_dev_setup, .validate = vti6_validate, .newlink = vti6_newlink, .dellink = vti6_dellink, .changelink = vti6_changelink, .get_size = vti6_get_size, .fill_info = vti6_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, struct list_head *list) { int h; struct ip6_tnl *t; for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); if (t) unregister_netdevice_queue(t->dev, list); } static int __net_init vti6_init_net(struct net *net) { struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0", NET_NAME_UNKNOWN, vti6_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &vti6_link_ops; err = vti6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static void __net_exit vti6_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct vti6_net *ip6n; struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) { ip6n = net_generic(net, vti6_net_id); vti6_destroy_tunnels(ip6n, dev_to_kill); } } static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, .exit_batch_rtnl = vti6_exit_batch_rtnl, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { .handler = vti6_rcv, .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, }; static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { .handler = vti6_rcv, .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, }; static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .handler = vti6_rcv, .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int vti6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return vti6_input_proto(skb, IPPROTO_IPV6, spi, 0); } static struct xfrm6_tunnel vti_ipv6_handler __read_mostly = { .handler = vti6_rcv_tunnel, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 0, }; static struct xfrm6_tunnel vti_ip6ip_handler __read_mostly = { .handler = vti6_rcv_tunnel, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 0, }; #endif /** * vti6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init vti6_tunnel_init(void) { const char *msg; int err; msg = "tunnel device"; err = register_pernet_device(&vti6_net_ops); if (err < 0) goto pernet_dev_failed; msg = "tunnel protocols"; err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) msg = "ipv6 tunnel"; err = xfrm6_tunnel_register(&vti_ipv6_handler, AF_INET6); if (err < 0) goto vti_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&vti_ip6ip_handler, AF_INET); if (err < 0) goto vti_tunnel_ip6ip_failed; #endif msg = "netlink interface"; err = rtnl_link_register(&vti6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET); vti_tunnel_ip6ip_failed: err = xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6); vti_tunnel_ipv6_failed: #endif xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: unregister_pernet_device(&vti6_net_ops); pernet_dev_failed: pr_err("vti6 init: failed to register %s\n", msg); return err; } /** * vti6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit vti6_tunnel_cleanup(void) { rtnl_link_unregister(&vti6_link_ops); #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); unregister_pernet_device(&vti6_net_ops); } module_init(vti6_tunnel_init); module_exit(vti6_tunnel_cleanup); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("vti6"); MODULE_ALIAS_NETDEV("ip6_vti0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("IPv6 virtual tunnel interface");
1883 1883 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOPRIO_H #define IOPRIO_H #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/iocontext.h> #include <uapi/linux/ioprio.h> /* * Default IO priority. */ #define IOPRIO_DEFAULT IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0) /* * Check that a priority value has a valid class. */ static inline bool ioprio_valid(unsigned short ioprio) { unsigned short class = IOPRIO_PRIO_CLASS(ioprio); return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE; } /* * if process has set io priority explicitly, use that. if not, convert * the cpu scheduler nice value to an io priority */ static inline int task_nice_ioprio(struct task_struct *task) { return (task_nice(task) + 20) / 5; } /* * This is for the case where the task hasn't asked for a specific IO class. * Check for idle and rt task process, and return appropriate IO class. */ static inline int task_nice_ioclass(struct task_struct *task) { if (task->policy == SCHED_IDLE) return IOPRIO_CLASS_IDLE; else if (rt_or_dl_task_policy(task)) return IOPRIO_CLASS_RT; else return IOPRIO_CLASS_BE; } #ifdef CONFIG_BLOCK /* * If the task has set an I/O priority, use that. Otherwise, return * the default I/O priority. * * Expected to be called for current task or with task_lock() held to keep * io_context stable. */ static inline int __get_task_ioprio(struct task_struct *p) { struct io_context *ioc = p->io_context; int prio; if (!ioc) return IOPRIO_DEFAULT; if (p != current) lockdep_assert_held(&p->alloc_lock); prio = ioc->ioprio; if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), task_nice_ioprio(p)); return prio; } #else static inline int __get_task_ioprio(struct task_struct *p) { return IOPRIO_DEFAULT; } #endif /* CONFIG_BLOCK */ static inline int get_current_ioprio(void) { return __get_task_ioprio(current); } extern int set_task_ioprio(struct task_struct *task, int ioprio); #ifdef CONFIG_BLOCK extern int ioprio_check_cap(int ioprio); #else static inline int ioprio_check_cap(int ioprio) { return -ENOTBLK; } #endif /* CONFIG_BLOCK */ #endif
919 919 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2013-2014 Intel Corp. */ #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/module.h> #include <linux/debugfs.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/pkt_sched.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/6lowpan.h> /* for the compression support */ #define VERSION "0.1" static struct dentry *lowpan_enable_debugfs; static struct dentry *lowpan_control_debugfs; #define IFACE_NAME_TEMPLATE "bt%d" struct skb_cb { struct in6_addr addr; struct in6_addr gw; struct l2cap_chan *chan; }; #define lowpan_cb(skb) ((struct skb_cb *)((skb)->cb)) /* The devices list contains those devices that we are acting * as a proxy. The BT 6LoWPAN device is a virtual device that * connects to the Bluetooth LE device. The real connection to * BT device is done via l2cap layer. There exists one * virtual device / one BT 6LoWPAN network (=hciX device). * The list contains struct lowpan_dev elements. */ static LIST_HEAD(bt_6lowpan_devices); static DEFINE_SPINLOCK(devices_lock); static bool enable_6lowpan; /* We are listening incoming connections via this channel */ static struct l2cap_chan *listen_chan; static DEFINE_MUTEX(set_lock); struct lowpan_peer { struct list_head list; struct rcu_head rcu; struct l2cap_chan *chan; /* peer addresses in various formats */ unsigned char lladdr[ETH_ALEN]; struct in6_addr peer_addr; }; struct lowpan_btle_dev { struct list_head list; struct hci_dev *hdev; struct net_device *netdev; struct list_head peers; atomic_t peer_count; /* number of items in peers list */ struct work_struct delete_netdev; struct delayed_work notify_peers; }; static inline struct lowpan_btle_dev * lowpan_btle_dev(const struct net_device *netdev) { return (struct lowpan_btle_dev *)lowpan_dev(netdev)->priv; } static inline void peer_add(struct lowpan_btle_dev *dev, struct lowpan_peer *peer) { list_add_rcu(&peer->list, &dev->peers); atomic_inc(&dev->peer_count); } static inline bool peer_del(struct lowpan_btle_dev *dev, struct lowpan_peer *peer) { list_del_rcu(&peer->list); kfree_rcu(peer, rcu); module_put(THIS_MODULE); if (atomic_dec_and_test(&dev->peer_count)) { BT_DBG("last peer"); return true; } return false; } static inline struct lowpan_peer * __peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan) { struct lowpan_peer *peer; list_for_each_entry_rcu(peer, &dev->peers, list) { if (peer->chan == chan) return peer; } return NULL; } static inline struct lowpan_peer * __peer_lookup_conn(struct lowpan_btle_dev *dev, struct l2cap_conn *conn) { struct lowpan_peer *peer; list_for_each_entry_rcu(peer, &dev->peers, list) { if (peer->chan->conn == conn) return peer; } return NULL; } static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, struct in6_addr *daddr, struct sk_buff *skb) { struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int count = atomic_read(&dev->peer_count); const struct in6_addr *nexthop; struct lowpan_peer *peer; struct neighbour *neigh; BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt); if (!rt) { if (ipv6_addr_any(&lowpan_cb(skb)->gw)) { /* There is neither route nor gateway, * probably the destination is a direct peer. */ nexthop = daddr; } else { /* There is a known gateway */ nexthop = &lowpan_cb(skb)->gw; } } else { nexthop = rt6_nexthop(rt, daddr); /* We need to remember the address because it is needed * by bt_xmit() when sending the packet. In bt_xmit(), the * destination routing info is not set. */ memcpy(&lowpan_cb(skb)->gw, nexthop, sizeof(struct in6_addr)); } BT_DBG("gw %pI6c", nexthop); rcu_read_lock(); list_for_each_entry_rcu(peer, &dev->peers, list) { BT_DBG("dst addr %pMR dst type %u ip %pI6c", &peer->chan->dst, peer->chan->dst_type, &peer->peer_addr); if (!ipv6_addr_cmp(&peer->peer_addr, nexthop)) { rcu_read_unlock(); return peer; } } /* use the neighbour cache for matching addresses assigned by SLAAC */ neigh = __ipv6_neigh_lookup(dev->netdev, nexthop); if (neigh) { list_for_each_entry_rcu(peer, &dev->peers, list) { if (!memcmp(neigh->ha, peer->lladdr, ETH_ALEN)) { neigh_release(neigh); rcu_read_unlock(); return peer; } } neigh_release(neigh); } rcu_read_unlock(); return NULL; } static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer = NULL; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { peer = __peer_lookup_conn(entry, conn); if (peer) break; } rcu_read_unlock(); return peer; } static struct lowpan_btle_dev *lookup_dev(struct l2cap_conn *conn) { struct lowpan_btle_dev *entry; struct lowpan_btle_dev *dev = NULL; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { if (conn->hcon->hdev == entry->hdev) { dev = entry; break; } } rcu_read_unlock(); return dev; } static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *skb_cp; skb_cp = skb_copy(skb, GFP_ATOMIC); if (!skb_cp) return NET_RX_DROP; return netif_rx(skb_cp); } static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, struct lowpan_peer *peer) { const u8 *saddr; saddr = peer->lladdr; return lowpan_header_decompress(skb, netdev, netdev->dev_addr, saddr); } static int recv_pkt(struct sk_buff *skb, struct net_device *dev, struct lowpan_peer *peer) { struct sk_buff *local_skb; int ret; if (!netif_running(dev)) goto drop; if (dev->type != ARPHRD_6LOWPAN || !skb->len) goto drop; skb_reset_network_header(skb); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto drop; /* check that it's our buffer */ if (lowpan_is_ipv6(*skb_network_header(skb))) { /* Pull off the 1-byte of 6lowpan header. */ skb_pull(skb, 1); /* Copy the packet so that the IPv6 header is * properly aligned. */ local_skb = skb_copy_expand(skb, NET_SKB_PAD - 1, skb_tailroom(skb), GFP_ATOMIC); if (!local_skb) goto drop; local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; local_skb->dev = dev; skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { kfree_skb(local_skb); goto drop; } dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; consume_skb(local_skb); consume_skb(skb); } else if (lowpan_is_iphc(*skb_network_header(skb))) { local_skb = skb_clone(skb, GFP_ATOMIC); if (!local_skb) goto drop; local_skb->dev = dev; ret = iphc_decompress(local_skb, dev, peer); if (ret < 0) { BT_DBG("iphc_decompress failed: %d", ret); kfree_skb(local_skb); goto drop; } local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { kfree_skb(local_skb); goto drop; } dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; consume_skb(local_skb); consume_skb(skb); } else { BT_DBG("unknown packet type"); goto drop; } return NET_RX_SUCCESS; drop: dev->stats.rx_dropped++; return NET_RX_DROP; } /* Packet from BT LE device */ static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { struct lowpan_btle_dev *dev; struct lowpan_peer *peer; int err; peer = lookup_peer(chan->conn); if (!peer) return -ENOENT; dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return -ENOENT; err = recv_pkt(skb, dev->netdev, peer); if (err) { BT_DBG("recv pkt %d", err); err = -EAGAIN; } return err; } static int setup_header(struct sk_buff *skb, struct net_device *netdev, bdaddr_t *peer_addr, u8 *peer_addr_type) { struct in6_addr ipv6_daddr; struct ipv6hdr *hdr; struct lowpan_btle_dev *dev; struct lowpan_peer *peer; u8 *daddr; int err, status = 0; hdr = ipv6_hdr(skb); dev = lowpan_btle_dev(netdev); memcpy(&ipv6_daddr, &hdr->daddr, sizeof(ipv6_daddr)); if (ipv6_addr_is_multicast(&ipv6_daddr)) { lowpan_cb(skb)->chan = NULL; daddr = NULL; } else { BT_DBG("dest IP %pI6c", &ipv6_daddr); /* The packet might be sent to 6lowpan interface * because of routing (either via default route * or user set route) so get peer according to * the destination address. */ peer = peer_lookup_dst(dev, &ipv6_daddr, skb); if (!peer) { BT_DBG("no such peer"); return -ENOENT; } daddr = peer->lladdr; *peer_addr = peer->chan->dst; *peer_addr_type = peer->chan->dst_type; lowpan_cb(skb)->chan = peer->chan; status = 1; } lowpan_header_compress(skb, netdev, daddr, dev->netdev->dev_addr); err = dev_hard_header(skb, netdev, ETH_P_IPV6, NULL, NULL, 0); if (err < 0) return err; return status; } static int header_create(struct sk_buff *skb, struct net_device *netdev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) { if (type != ETH_P_IPV6) return -EINVAL; return 0; } /* Packet to BT LE device */ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, struct net_device *netdev) { struct msghdr msg; struct kvec iv; int err; /* Remember the skb so that we can send EAGAIN to the caller if * we run out of credits. */ chan->data = skb; iv.iov_base = skb->data; iv.iov_len = skb->len; memset(&msg, 0, sizeof(msg)); iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, skb->len); err = l2cap_chan_send(chan, &msg, skb->len); if (err > 0) { netdev->stats.tx_bytes += err; netdev->stats.tx_packets++; return 0; } if (err < 0) netdev->stats.tx_errors++; return err; } static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) { struct sk_buff *local_skb; struct lowpan_btle_dev *entry; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { struct lowpan_peer *pentry; struct lowpan_btle_dev *dev; if (entry->netdev != netdev) continue; dev = lowpan_btle_dev(entry->netdev); list_for_each_entry_rcu(pentry, &dev->peers, list) { int ret; local_skb = skb_clone(skb, GFP_ATOMIC); BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &pentry->chan->dst, pentry->chan->dst_type, &pentry->peer_addr, pentry->chan); ret = send_pkt(pentry->chan, local_skb, netdev); if (ret < 0) err = ret; kfree_skb(local_skb); } } rcu_read_unlock(); return err; } static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) { int err = 0; bdaddr_t addr; u8 addr_type; /* We must take a copy of the skb before we modify/replace the ipv6 * header as the header could be used elsewhere */ skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) return NET_XMIT_DROP; /* Return values from setup_header() * <0 - error, packet is dropped * 0 - this is a multicast packet * 1 - this is unicast packet */ err = setup_header(skb, netdev, &addr, &addr_type); if (err < 0) { kfree_skb(skb); return NET_XMIT_DROP; } if (err) { if (lowpan_cb(skb)->chan) { BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &addr, addr_type, &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan); err = send_pkt(lowpan_cb(skb)->chan, skb, netdev); } else { err = -ENOENT; } } else { /* We need to send the packet to every device behind this * interface. */ err = send_mcast_pkt(skb, netdev); } dev_kfree_skb(skb); if (err) BT_DBG("ERROR: xmit failed (%d)", err); return err < 0 ? NET_XMIT_DROP : err; } static int bt_dev_init(struct net_device *dev) { netdev_lockdep_set_classes(dev); return 0; } static const struct net_device_ops netdev_ops = { .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; static const struct header_ops header_ops = { .create = header_create, }; static void netdev_setup(struct net_device *dev) { dev->hard_header_len = 0; dev->needed_tailroom = 0; dev->flags = IFF_RUNNING | IFF_MULTICAST; dev->watchdog_timeo = 0; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->netdev_ops = &netdev_ops; dev->header_ops = &header_ops; dev->needs_free_netdev = true; } static const struct device_type bt_type = { .name = "bluetooth", }; static void ifup(struct net_device *netdev) { int err; rtnl_lock(); err = dev_open(netdev, NULL); if (err < 0) BT_INFO("iface %s cannot be opened (%d)", netdev->name, err); rtnl_unlock(); } static void ifdown(struct net_device *netdev) { rtnl_lock(); dev_close(netdev); rtnl_unlock(); } static void do_notify_peers(struct work_struct *work) { struct lowpan_btle_dev *dev = container_of(work, struct lowpan_btle_dev, notify_peers.work); netdev_notify_peers(dev->netdev); /* send neighbour adv at startup */ } static bool is_bt_6lowpan(struct hci_conn *hcon) { if (hcon->type != LE_LINK) return false; if (!enable_6lowpan) return false; return true; } static struct l2cap_chan *chan_create(void) { struct l2cap_chan *chan; chan = l2cap_chan_create(); if (!chan) return NULL; l2cap_chan_set_defaults(chan); chan->chan_type = L2CAP_CHAN_CONN_ORIENTED; chan->mode = L2CAP_MODE_LE_FLOWCTL; chan->imtu = 1280; return chan; } static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, struct lowpan_btle_dev *dev, bool new_netdev) { struct lowpan_peer *peer; peer = kzalloc(sizeof(*peer), GFP_ATOMIC); if (!peer) return NULL; peer->chan = chan; baswap((void *)peer->lladdr, &chan->dst); lowpan_iphc_uncompress_eui48_lladdr(&peer->peer_addr, peer->lladdr); spin_lock(&devices_lock); INIT_LIST_HEAD(&peer->list); peer_add(dev, peer); spin_unlock(&devices_lock); /* Notifying peers about us needs to be done without locks held */ if (new_netdev) INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers); schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100)); return peer->chan; } static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) { struct net_device *netdev; bdaddr_t addr; int err; netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)), IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN, netdev_setup); if (!netdev) return -ENOMEM; netdev->addr_assign_type = NET_ADDR_PERM; baswap(&addr, &chan->src); __dev_addr_set(netdev, &addr, sizeof(addr)); netdev->netdev_ops = &netdev_ops; SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); *dev = lowpan_btle_dev(netdev); (*dev)->netdev = netdev; (*dev)->hdev = chan->conn->hcon->hdev; INIT_LIST_HEAD(&(*dev)->peers); spin_lock(&devices_lock); INIT_LIST_HEAD(&(*dev)->list); list_add_rcu(&(*dev)->list, &bt_6lowpan_devices); spin_unlock(&devices_lock); err = lowpan_register_netdev(netdev, LOWPAN_LLTYPE_BTLE); if (err < 0) { BT_INFO("register_netdev failed %d", err); spin_lock(&devices_lock); list_del_rcu(&(*dev)->list); spin_unlock(&devices_lock); free_netdev(netdev); goto out; } BT_DBG("ifindex %d peer bdaddr %pMR type %d my addr %pMR type %d", netdev->ifindex, &chan->dst, chan->dst_type, &chan->src, chan->src_type); set_bit(__LINK_STATE_PRESENT, &netdev->state); return 0; out: return err; } static inline void chan_ready_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; bool new_netdev = false; dev = lookup_dev(chan->conn); BT_DBG("chan %p conn %p dev %p", chan, chan->conn, dev); if (!dev) { if (setup_netdev(chan, &dev) < 0) { l2cap_chan_del(chan, -ENOENT); return; } new_netdev = true; } if (!try_module_get(THIS_MODULE)) return; add_peer_chan(chan, dev, new_netdev); ifup(dev->netdev); } static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; chan = chan_create(); if (!chan) return NULL; chan->ops = pchan->ops; BT_DBG("chan %p pchan %p", chan, pchan); return chan; } static void delete_netdev(struct work_struct *work) { struct lowpan_btle_dev *entry = container_of(work, struct lowpan_btle_dev, delete_netdev); lowpan_unregister_netdev(entry->netdev); /* The entry pointer is deleted by the netdev destructor. */ } static void chan_close_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *entry; struct lowpan_btle_dev *dev = NULL; struct lowpan_peer *peer; int err = -ENOENT; bool last = false, remove = true; BT_DBG("chan %p conn %p", chan, chan->conn); if (chan->conn && chan->conn->hcon) { if (!is_bt_6lowpan(chan->conn->hcon)) return; /* If conn is set, then the netdev is also there and we should * not remove it. */ remove = false; } spin_lock(&devices_lock); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { dev = lowpan_btle_dev(entry->netdev); peer = __peer_lookup_chan(dev, chan); if (peer) { last = peer_del(dev, peer); err = 0; BT_DBG("dev %p removing %speer %p", dev, last ? "last " : "1 ", peer); BT_DBG("chan %p orig refcnt %u", chan, kref_read(&chan->kref)); l2cap_chan_put(chan); break; } } if (!err && last && dev && !atomic_read(&dev->peer_count)) { spin_unlock(&devices_lock); cancel_delayed_work_sync(&dev->notify_peers); ifdown(dev->netdev); if (remove) { INIT_WORK(&entry->delete_netdev, delete_netdev); schedule_work(&entry->delete_netdev); } } else { spin_unlock(&devices_lock); } } static void chan_state_change_cb(struct l2cap_chan *chan, int state, int err) { BT_DBG("chan %p conn %p state %s err %d", chan, chan->conn, state_to_string(state), err); } static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { /* Note that we must allocate using GFP_ATOMIC here as * this function is called originally from netdev hard xmit * function in atomic context. */ return bt_skb_alloc(hdr_len + len, GFP_ATOMIC); } static void chan_suspend_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; BT_DBG("chan %p suspend", chan); dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return; netif_stop_queue(dev->netdev); } static void chan_resume_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; BT_DBG("chan %p resume", chan); dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return; netif_wake_queue(dev->netdev); } static long chan_get_sndtimeo_cb(struct l2cap_chan *chan) { return L2CAP_CONN_TIMEOUT; } static const struct l2cap_ops bt_6lowpan_chan_ops = { .name = "L2CAP 6LoWPAN channel", .new_connection = chan_new_conn_cb, .recv = chan_recv_cb, .close = chan_close_cb, .state_change = chan_state_change_cb, .ready = chan_ready_cb, .resume = chan_resume_cb, .suspend = chan_suspend_cb, .get_sndtimeo = chan_get_sndtimeo_cb, .alloc_skb = chan_alloc_skb_cb, .teardown = l2cap_chan_no_teardown, .defer = l2cap_chan_no_defer, .set_shutdown = l2cap_chan_no_set_shutdown, }; static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) { struct l2cap_chan *chan; int err; chan = chan_create(); if (!chan) return -EINVAL; chan->ops = &bt_6lowpan_chan_ops; err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0, addr, dst_type, L2CAP_CONN_TIMEOUT); BT_DBG("chan %p err %d", chan, err); if (err < 0) l2cap_chan_put(chan); return err; } static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) { struct lowpan_peer *peer; BT_DBG("conn %p dst type %u", conn, dst_type); peer = lookup_peer(conn); if (!peer) return -ENOENT; BT_DBG("peer %p chan %p", peer, peer->chan); l2cap_chan_close(peer->chan, ENOENT); return 0; } static struct l2cap_chan *bt_6lowpan_listen(void) { bdaddr_t *addr = BDADDR_ANY; struct l2cap_chan *chan; int err; if (!enable_6lowpan) return NULL; chan = chan_create(); if (!chan) return NULL; chan->ops = &bt_6lowpan_chan_ops; chan->state = BT_LISTEN; chan->src_type = BDADDR_LE_PUBLIC; atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); BT_DBG("chan %p src type %u", chan, chan->src_type); err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); if (err) { l2cap_chan_put(chan); BT_ERR("psm cannot be added err %d", err); return NULL; } return chan; } static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, struct l2cap_conn **conn) { struct hci_conn *hcon; struct hci_dev *hdev; int n; n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", &addr->b[5], &addr->b[4], &addr->b[3], &addr->b[2], &addr->b[1], &addr->b[0], addr_type); if (n < 7) return -EINVAL; /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */ hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC); if (!hdev) return -ENOENT; hci_dev_lock(hdev); hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type); hci_dev_unlock(hdev); hci_dev_put(hdev); if (!hcon) return -ENOENT; *conn = (struct l2cap_conn *)hcon->l2cap_data; BT_DBG("conn %p dst %pMR type %u", *conn, &hcon->dst, hcon->dst_type); return 0; } static void disconnect_all_peers(void) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer, *tmp_peer, *new_peer; struct list_head peers; INIT_LIST_HEAD(&peers); /* We make a separate list of peers as the close_cb() will * modify the device peers list so it is better not to mess * with the same list at the same time. */ rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { list_for_each_entry_rcu(peer, &entry->peers, list) { new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC); if (!new_peer) break; new_peer->chan = peer->chan; INIT_LIST_HEAD(&new_peer->list); list_add(&new_peer->list, &peers); } } rcu_read_unlock(); spin_lock(&devices_lock); list_for_each_entry_safe(peer, tmp_peer, &peers, list) { l2cap_chan_close(peer->chan, ENOENT); list_del_rcu(&peer->list); kfree_rcu(peer, rcu); } spin_unlock(&devices_lock); } struct set_enable { struct work_struct work; bool flag; }; static void do_enable_set(struct work_struct *work) { struct set_enable *set_enable = container_of(work, struct set_enable, work); if (!set_enable->flag || enable_6lowpan != set_enable->flag) /* Disconnect existing connections if 6lowpan is * disabled */ disconnect_all_peers(); enable_6lowpan = set_enable->flag; mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); } listen_chan = bt_6lowpan_listen(); mutex_unlock(&set_lock); kfree(set_enable); } static int lowpan_enable_set(void *data, u64 val) { struct set_enable *set_enable; set_enable = kzalloc(sizeof(*set_enable), GFP_KERNEL); if (!set_enable) return -ENOMEM; set_enable->flag = !!val; INIT_WORK(&set_enable->work, do_enable_set); schedule_work(&set_enable->work); return 0; } static int lowpan_enable_get(void *data, u64 *val) { *val = enable_6lowpan; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(lowpan_enable_fops, lowpan_enable_get, lowpan_enable_set, "%llu\n"); static ssize_t lowpan_control_write(struct file *fp, const char __user *user_buffer, size_t count, loff_t *position) { char buf[32]; size_t buf_size = min(count, sizeof(buf) - 1); int ret; bdaddr_t addr; u8 addr_type; struct l2cap_conn *conn = NULL; if (copy_from_user(buf, user_buffer, buf_size)) return -EFAULT; buf[buf_size] = '\0'; if (memcmp(buf, "connect ", 8) == 0) { ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn); if (ret == -EINVAL) return ret; mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); listen_chan = NULL; } mutex_unlock(&set_lock); if (conn) { struct lowpan_peer *peer; if (!is_bt_6lowpan(conn->hcon)) return -EINVAL; peer = lookup_peer(conn); if (peer) { BT_DBG("6LoWPAN connection already exists"); return -EALREADY; } BT_DBG("conn %p dst %pMR type %d user %u", conn, &conn->hcon->dst, conn->hcon->dst_type, addr_type); } ret = bt_6lowpan_connect(&addr, addr_type); if (ret < 0) return ret; return count; } if (memcmp(buf, "disconnect ", 11) == 0) { ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn); if (ret < 0) return ret; ret = bt_6lowpan_disconnect(conn, addr_type); if (ret < 0) return ret; return count; } return count; } static int lowpan_control_show(struct seq_file *f, void *ptr) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer; spin_lock(&devices_lock); list_for_each_entry(entry, &bt_6lowpan_devices, list) { list_for_each_entry(peer, &entry->peers, list) seq_printf(f, "%pMR (type %u)\n", &peer->chan->dst, peer->chan->dst_type); } spin_unlock(&devices_lock); return 0; } static int lowpan_control_open(struct inode *inode, struct file *file) { return single_open(file, lowpan_control_show, inode->i_private); } static const struct file_operations lowpan_control_fops = { .open = lowpan_control_open, .read = seq_read, .write = lowpan_control_write, .llseek = seq_lseek, .release = single_release, }; static void disconnect_devices(void) { struct lowpan_btle_dev *entry, *tmp, *new_dev; struct list_head devices; INIT_LIST_HEAD(&devices); /* We make a separate list of devices because the unregister_netdev() * will call device_event() which will also want to modify the same * devices list. */ rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { new_dev = kmalloc(sizeof(*new_dev), GFP_ATOMIC); if (!new_dev) break; new_dev->netdev = entry->netdev; INIT_LIST_HEAD(&new_dev->list); list_add_rcu(&new_dev->list, &devices); } rcu_read_unlock(); list_for_each_entry_safe(entry, tmp, &devices, list) { ifdown(entry->netdev); BT_DBG("Unregistering netdev %s %p", entry->netdev->name, entry->netdev); lowpan_unregister_netdev(entry->netdev); kfree(entry); } } static int device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct lowpan_btle_dev *entry; if (netdev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: spin_lock(&devices_lock); list_for_each_entry(entry, &bt_6lowpan_devices, list) { if (entry->netdev == netdev) { BT_DBG("Unregistered netdev %s %p", netdev->name, netdev); list_del(&entry->list); break; } } spin_unlock(&devices_lock); break; } return NOTIFY_DONE; } static struct notifier_block bt_6lowpan_dev_notifier = { .notifier_call = device_event, }; static int __init bt_6lowpan_init(void) { lowpan_enable_debugfs = debugfs_create_file_unsafe("6lowpan_enable", 0644, bt_debugfs, NULL, &lowpan_enable_fops); lowpan_control_debugfs = debugfs_create_file("6lowpan_control", 0644, bt_debugfs, NULL, &lowpan_control_fops); return register_netdevice_notifier(&bt_6lowpan_dev_notifier); } static void __exit bt_6lowpan_exit(void) { debugfs_remove(lowpan_enable_debugfs); debugfs_remove(lowpan_control_debugfs); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); } disconnect_devices(); unregister_netdevice_notifier(&bt_6lowpan_dev_notifier); } module_init(bt_6lowpan_init); module_exit(bt_6lowpan_exit); MODULE_AUTHOR("Jukka Rissanen <jukka.rissanen@linux.intel.com>"); MODULE_DESCRIPTION("Bluetooth 6LoWPAN"); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL");
918 6 908 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. */ #include <linux/netdevice.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include "datapath.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static void dp_detach_port_notify(struct vport *vport) { struct sk_buff *notify; struct datapath *dp; dp = vport->dp; notify = ovs_vport_cmd_build_info(vport, ovs_dp_get_net(dp), 0, 0, OVS_VPORT_CMD_DEL); ovs_dp_detach_port(vport); if (IS_ERR(notify)) { genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, 0, PTR_ERR(notify)); return; } genlmsg_multicast_netns(&dp_vport_genl_family, ovs_dp_get_net(dp), notify, 0, 0, GFP_KERNEL); } void ovs_dp_notify_wq(struct work_struct *work) { struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work); struct datapath *dp; ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) { int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; if (!(netif_is_ovs_port(vport->dev))) dp_detach_port_notify(vport); } } } ovs_unlock(); } static int dp_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct ovs_net *ovs_net; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vport *vport = NULL; if (!ovs_is_internal_dev(dev)) vport = ovs_netdev_get_vport(dev); if (!vport) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { /* upper_dev_unlink and decrement promisc immediately */ ovs_netdev_detach_dev(vport); /* schedule vport destroy, dev_put and genl notification */ ovs_net = net_generic(dev_net(dev), ovs_net_id); queue_work(system_wq, &ovs_net->dp_notify_work); } return NOTIFY_DONE; } struct notifier_block ovs_dp_device_notifier = { .notifier_call = dp_device_event };
1 12 847 338 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETLINK_H #define __LINUX_NETLINK_H #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/scm.h> #include <uapi/linux/netlink.h> struct net; void do_trace_netlink_extack(const char *msg); static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) { return (struct nlmsghdr *)skb->data; } enum netlink_skb_flags { NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */ }; struct netlink_skb_parms { struct scm_creds creds; /* Skb credentials */ __u32 portid; __u32 dst_group; __u32 flags; struct sock *sk; bool nsid_is_set; int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) #define NETLINK_CTX_SIZE 48 void netlink_table_grab(void); void netlink_table_ungrab(void); #define NL_CFG_F_NONROOT_RECV (1 << 0) #define NL_CFG_F_NONROOT_SEND (1 << 1) /* optional Netlink kernel configuration parameters */ struct netlink_kernel_cfg { unsigned int groups; unsigned int flags; void (*input)(struct sk_buff *skb); int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release) (struct sock *sk, unsigned long *groups); }; struct sock *__netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg); static inline struct sock * netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) { return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 20 #define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute * @miss_type: attribute type which was missing * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length * @_msg_buf: output buffer for formatted message strings - don't access * directly, use %NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; const struct nlattr *miss_nest; u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) \ __extack->_msg = __msg; \ } while (0) /* We splice fmt with %s at each end even in the snprintf so that both calls * can use the same string constant, avoiding its duplication in .ro */ #define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ } while (0) #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) #define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) #define NL_SET_ERR_MSG_WEAK(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG((extack), msg); \ } while (0) #define NL_SET_ERR_MSG_WEAK_MOD(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG_MOD((extack), msg); \ } while (0) #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ (extack)->policy = (pol); \ } \ } while (0) #define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) #define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) { \ __extack->_msg = __msg; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } \ } while (0) #define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } while (0) #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) #define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) \ NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args) #define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) { \ __extack->miss_nest = (nest); \ __extack->miss_type = (type); \ } \ } while (0) #define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ struct nlattr **__tb = (tb); \ u32 __attr = (type); \ int __retval; \ \ __retval = !__tb[__attr]; \ if (__retval) \ NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ __retval; \ }) static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { if (!extack) return; memcpy(extack->cookie, &cookie, sizeof(cookie)); extack->cookie_len = sizeof(cookie); } void netlink_kernel_release(struct sock *sk); int __netlink_change_ngroups(struct sock *sk, unsigned int groups); int netlink_change_ngroups(struct sock *sk, unsigned int groups); void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack); int netlink_has_listeners(struct sock *sk, unsigned int group); bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data); int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfd(int fd); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb); static inline struct sk_buff * netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *nskb; nskb = skb_clone(skb, gfp_mask); if (!nskb) return NULL; /* This is a large skb, set destructor callback to release head */ if (is_vmalloc_addr(skb->head)) nskb->destructor = skb->destructor; return nskb; } /* * skb should fit one page. This choice is good for headerless malloc. * But we should limit to 8K so that userspace does not have to * use enormous buffer sizes on recvmsg() calls just to avoid * MSG_TRUNC when PAGE_SIZE is very large. */ #if PAGE_SIZE < 8192UL #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(PAGE_SIZE) #else #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(8192UL) #endif #define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); void *data; /* the module that dump function belong to */ struct module *module; struct netlink_ext_ack *extack; u16 family; u16 answer_flags; u32 min_dump_alloc; unsigned int prev_seq, seq; int flags; bool strict_check; union { u8 ctx[NETLINK_CTX_SIZE]; /* args is deprecated. Cast a struct over ctx instead * for proper type safety. */ long args[6]; }; }; #define NL_ASSERT_CTX_FITS(type_name) \ BUILD_BUG_ON(sizeof(type_name) > \ sizeof_field(struct netlink_callback, ctx)) struct netlink_notify { struct net *net; u32 portid; int protocol; }; struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags); struct netlink_dump_control { int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *skb, struct netlink_callback *); int (*done)(struct netlink_callback *); struct netlink_ext_ack *extack; void *data; struct module *module; u32 min_dump_alloc; int flags; }; int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control); static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { if (!control->module) control->module = THIS_MODULE; return __netlink_dump_start(ssk, skb, nlh, control); } struct netlink_tap { struct net_device *dev; struct module *module; struct list_head list; }; int netlink_add_tap(struct netlink_tap *nt); int netlink_remove_tap(struct netlink_tap *nt); bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *ns, int cap); bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *ns, int cap); bool netlink_capable(const struct sk_buff *skb, int cap); bool netlink_net_capable(const struct sk_buff *skb, int cap); struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast); #endif /* __LINUX_NETLINK_H */
19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/bug.h> #include <linux/export.h> #include <linux/limits.h> #include <linux/math.h> #include <linux/minmax.h> #include <linux/types.h> #include <linux/reciprocal_div.h> /* * For a description of the algorithm please have a look at * include/linux/reciprocal_div.h */ struct reciprocal_value reciprocal_value(u32 d) { struct reciprocal_value R; u64 m; int l; l = fls(d - 1); m = ((1ULL << 32) * ((1ULL << l) - d)); do_div(m, d); ++m; R.m = (u32)m; R.sh1 = min(l, 1); R.sh2 = max(l - 1, 0); return R; } EXPORT_SYMBOL(reciprocal_value); struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec) { struct reciprocal_value_adv R; u32 l, post_shift; u64 mhigh, mlow; /* ceil(log2(d)) */ l = fls(d - 1); /* NOTE: mlow/mhigh could overflow u64 when l == 32. This case needs to * be handled before calling "reciprocal_value_adv", please see the * comment at include/linux/reciprocal_div.h. */ WARN(l == 32, "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor", d, __func__); post_shift = l; mlow = 1ULL << (32 + l); do_div(mlow, d); mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec)); do_div(mhigh, d); for (; post_shift > 0; post_shift--) { u64 lo = mlow >> 1, hi = mhigh >> 1; if (lo >= hi) break; mlow = lo; mhigh = hi; } R.m = (u32)mhigh; R.sh = post_shift; R.exp = l; R.is_wide_m = mhigh > U32_MAX; return R; } EXPORT_SYMBOL(reciprocal_value_adv);
4 332 1 26 26 305 13 1 20 254 16 4 3 8 16 20 3 1 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 /* SPDX-License-Identifier: GPL-2.0 */ /* IP Virtual Server * data structure and functionality definitions */ #ifndef _NET_IP_VS_H #define _NET_IP_VS_H #include <linux/ip_vs.h> /* definitions shared with userland */ #include <asm/types.h> /* for __uXX types */ #include <linux/list.h> /* for struct list_head */ #include <linux/spinlock.h> /* for struct rwlock_t */ #include <linux/atomic.h> /* for struct atomic_t */ #include <linux/refcount.h> /* for struct refcount_t */ #include <linux/workqueue.h> #include <linux/compiler.h> #include <linux/timer.h> #include <linux/bug.h> #include <net/checksum.h> #include <linux/netfilter.h> /* for union nf_inet_addr */ #include <linux/ip.h> #include <linux/ipv6.h> /* for struct ipv6hdr */ #include <net/ipv6.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #include <net/net_namespace.h> /* Netw namespace */ #include <linux/sched/isolation.h> #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) { return net->ipvs; } /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; extern struct mutex __ip_vs_mutex; struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ __u32 len; /* IPv4 simply where L4 starts * IPv6 where L4 Transport Header starts */ __u16 fragoffs; /* IPv6 fragment offset, 0 if first frag (or not frag)*/ __s16 protocol; __s32 flags; union nf_inet_addr saddr; union nf_inet_addr daddr; }; static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset, int len, void *buffer) { return skb_header_pointer(skb, offset, len, buffer); } /* This function handles filling *ip_vs_iphdr, both for IPv4 and IPv6. * IPv6 requires some extra work, as finding proper header position, * depend on the IPv6 extension headers. */ static inline int ip_vs_fill_iph_skb_off(int af, const struct sk_buff *skb, int offset, int hdr_flags, struct ip_vs_iphdr *iphdr) { iphdr->hdr_flags = hdr_flags; iphdr->off = offset; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { struct ipv6hdr _iph; const struct ipv6hdr *iph = skb_header_pointer( skb, offset, sizeof(_iph), &_iph); if (!iph) return 0; iphdr->saddr.in6 = iph->saddr; iphdr->daddr.in6 = iph->daddr; /* ipv6_find_hdr() updates len, flags */ iphdr->len = offset; iphdr->flags = 0; iphdr->protocol = ipv6_find_hdr(skb, &iphdr->len, -1, &iphdr->fragoffs, &iphdr->flags); if (iphdr->protocol < 0) return 0; } else #endif { struct iphdr _iph; const struct iphdr *iph = skb_header_pointer( skb, offset, sizeof(_iph), &_iph); if (!iph) return 0; iphdr->len = offset + iph->ihl * 4; iphdr->fragoffs = 0; iphdr->protocol = iph->protocol; iphdr->saddr.ip = iph->saddr; iphdr->daddr.ip = iph->daddr; } return 1; } static inline int ip_vs_fill_iph_skb_icmp(int af, const struct sk_buff *skb, int offset, bool inverse, struct ip_vs_iphdr *iphdr) { int hdr_flags = IP_VS_HDR_ICMP; if (inverse) hdr_flags |= IP_VS_HDR_INVERSE; return ip_vs_fill_iph_skb_off(af, skb, offset, hdr_flags, iphdr); } static inline int ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, bool inverse, struct ip_vs_iphdr *iphdr) { int hdr_flags = 0; if (inverse) hdr_flags |= IP_VS_HDR_INVERSE; return ip_vs_fill_iph_skb_off(af, skb, skb_network_offset(skb), hdr_flags, iphdr); } static inline bool ip_vs_iph_inverse(const struct ip_vs_iphdr *iph) { return !!(iph->hdr_flags & IP_VS_HDR_INVERSE); } static inline bool ip_vs_iph_icmp(const struct ip_vs_iphdr *iph) { return !!(iph->hdr_flags & IP_VS_HDR_ICMP); } static inline void ip_vs_addr_copy(int af, union nf_inet_addr *dst, const union nf_inet_addr *src) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) dst->in6 = src->in6; else #endif dst->ip = src->ip; } static inline void ip_vs_addr_set(int af, union nf_inet_addr *dst, const union nf_inet_addr *src) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { dst->in6 = src->in6; return; } #endif dst->ip = src->ip; dst->all[1] = 0; dst->all[2] = 0; dst->all[3] = 0; } static inline int ip_vs_addr_equal(int af, const union nf_inet_addr *a, const union nf_inet_addr *b) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) return ipv6_addr_equal(&a->in6, &b->in6); #endif return a->ip == b->ip; } #ifdef CONFIG_IP_VS_DEBUG #include <linux/net.h> int ip_vs_get_debug_level(void); static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, const union nf_inet_addr *addr, int *idx) { int len; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) len = snprintf(&buf[*idx], buf_len - *idx, "[%pI6c]", &addr->in6) + 1; else #endif len = snprintf(&buf[*idx], buf_len - *idx, "%pI4", &addr->ip) + 1; *idx += len; BUG_ON(*idx > buf_len + 1); return &buf[*idx - len]; } #define IP_VS_DBG_BUF(level, msg, ...) \ do { \ char ip_vs_dbg_buf[160]; \ int ip_vs_dbg_idx = 0; \ if (level <= ip_vs_get_debug_level()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) #define IP_VS_ERR_BUF(msg...) \ do { \ char ip_vs_dbg_buf[160]; \ int ip_vs_dbg_idx = 0; \ pr_err(msg); \ } while (0) /* Only use from within IP_VS_DBG_BUF() or IP_VS_ERR_BUF macros */ #define IP_VS_DBG_ADDR(af, addr) \ ip_vs_dbg_addr(af, ip_vs_dbg_buf, \ sizeof(ip_vs_dbg_buf), addr, \ &ip_vs_dbg_idx) #define IP_VS_DBG(level, msg, ...) \ do { \ if (level <= ip_vs_get_debug_level()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) #define IP_VS_DBG_RL(msg, ...) \ do { \ if (net_ratelimit()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) #define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level()) \ pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) #define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level() && \ net_ratelimit()) \ pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) #else /* NO DEBUGGING at ALL */ #define IP_VS_DBG_BUF(level, msg...) do {} while (0) #define IP_VS_ERR_BUF(msg...) do {} while (0) #define IP_VS_DBG(level, msg...) do {} while (0) #define IP_VS_DBG_RL(msg...) do {} while (0) #define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #endif #define IP_VS_BUG() BUG() #define IP_VS_ERR_RL(msg, ...) \ do { \ if (net_ratelimit()) \ pr_err(msg, ##__VA_ARGS__); \ } while (0) /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) /* TCP State Values */ enum { IP_VS_TCP_S_NONE = 0, IP_VS_TCP_S_ESTABLISHED, IP_VS_TCP_S_SYN_SENT, IP_VS_TCP_S_SYN_RECV, IP_VS_TCP_S_FIN_WAIT, IP_VS_TCP_S_TIME_WAIT, IP_VS_TCP_S_CLOSE, IP_VS_TCP_S_CLOSE_WAIT, IP_VS_TCP_S_LAST_ACK, IP_VS_TCP_S_LISTEN, IP_VS_TCP_S_SYNACK, IP_VS_TCP_S_LAST }; /* UDP State Values */ enum { IP_VS_UDP_S_NORMAL, IP_VS_UDP_S_LAST, }; /* ICMP State Values */ enum { IP_VS_ICMP_S_NORMAL, IP_VS_ICMP_S_LAST, }; /* SCTP State Values */ enum ip_vs_sctp_states { IP_VS_SCTP_S_NONE, IP_VS_SCTP_S_INIT1, IP_VS_SCTP_S_INIT, IP_VS_SCTP_S_COOKIE_SENT, IP_VS_SCTP_S_COOKIE_REPLIED, IP_VS_SCTP_S_COOKIE_WAIT, IP_VS_SCTP_S_COOKIE, IP_VS_SCTP_S_COOKIE_ECHOED, IP_VS_SCTP_S_ESTABLISHED, IP_VS_SCTP_S_SHUTDOWN_SENT, IP_VS_SCTP_S_SHUTDOWN_RECEIVED, IP_VS_SCTP_S_SHUTDOWN_ACK_SENT, IP_VS_SCTP_S_REJECTED, IP_VS_SCTP_S_CLOSED, IP_VS_SCTP_S_LAST }; /* Connection templates use bits from state */ #define IP_VS_CTPL_S_NONE 0x0000 #define IP_VS_CTPL_S_ASSURED 0x0001 #define IP_VS_CTPL_S_LAST 0x0002 /* Delta sequence info structure * Each ip_vs_conn has 2 (output AND input seq. changes). * Only used in the VS/NAT. */ struct ip_vs_seq { __u32 init_seq; /* Add delta from this seq */ __u32 delta; /* Delta in sequence numbers */ __u32 previous_delta; /* Delta in sequence numbers * before last resized pkt */ }; /* counters per cpu */ struct ip_vs_counters { u64_stats_t conns; /* connections scheduled */ u64_stats_t inpkts; /* incoming packets */ u64_stats_t outpkts; /* outgoing packets */ u64_stats_t inbytes; /* incoming bytes */ u64_stats_t outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { struct ip_vs_counters cnt; struct u64_stats_sync syncp; }; /* Default nice for estimator kthreads */ #define IPVS_EST_NICE 0 /* IPVS statistics objects */ struct ip_vs_estimator { struct hlist_node list; u64 last_inbytes; u64 last_outbytes; u64 last_conns; u64 last_inpkts; u64 last_outpkts; u64 cps; u64 inpps; u64 outpps; u64 inbps; u64 outbps; s32 ktid:16, /* kthread ID, -1=temp list */ ktrow:8, /* row/tick ID for kthread */ ktcid:8; /* chain ID for kthread tick */ }; /* * IPVS statistics object, 64-bit kernel version of struct ip_vs_stats_user */ struct ip_vs_kstats { u64 conns; /* connections scheduled */ u64 inpkts; /* incoming packets */ u64 outpkts; /* outgoing packets */ u64 inbytes; /* incoming bytes */ u64 outbytes; /* outgoing bytes */ u64 cps; /* current connection rate */ u64 inpps; /* current in packet rate */ u64 outpps; /* current out packet rate */ u64 inbps; /* current in byte rate */ u64 outbps; /* current out byte rate */ }; struct ip_vs_stats { struct ip_vs_kstats kstats; /* kernel statistics */ struct ip_vs_estimator est; /* estimator */ struct ip_vs_cpu_stats __percpu *cpustats; /* per cpu counters */ spinlock_t lock; /* spin lock */ struct ip_vs_kstats kstats0; /* reset values */ }; struct ip_vs_stats_rcu { struct ip_vs_stats s; struct rcu_head rcu_head; }; int ip_vs_stats_init_alloc(struct ip_vs_stats *s); struct ip_vs_stats *ip_vs_stats_alloc(void); void ip_vs_stats_release(struct ip_vs_stats *stats); void ip_vs_stats_free(struct ip_vs_stats *stats); /* Process estimators in multiple timer ticks (20/50/100, see ktrow) */ #define IPVS_EST_NTICKS 50 /* Estimation uses a 2-second period containing ticks (in jiffies) */ #define IPVS_EST_TICK ((2 * HZ) / IPVS_EST_NTICKS) /* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C). * Value of 4 and above ensures kthreads will take work without exceeding * the CPU capacity under different circumstances. */ #define IPVS_EST_LOAD_DIVISOR 8 /* Kthreads should not have work that exceeds the CPU load above 50% */ #define IPVS_EST_CPU_KTHREADS (IPVS_EST_LOAD_DIVISOR / 2) /* Desired number of chains per timer tick (chain load factor in 100us units), * 48=4.8ms of 40ms tick (12% CPU usage): * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50 */ #define IPVS_EST_CHAIN_FACTOR \ ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8) /* Compiled number of chains per tick * The defines should match cond_resched_rcu */ #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) #define IPVS_EST_TICK_CHAINS IPVS_EST_CHAIN_FACTOR #else #define IPVS_EST_TICK_CHAINS 1 #endif #if IPVS_EST_NTICKS > 127 #error Too many timer ticks for ktrow #endif /* Multiple chains processed in same tick */ struct ip_vs_est_tick_data { struct rcu_head rcu_head; struct hlist_head chains[IPVS_EST_TICK_CHAINS]; DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS); DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS); int chain_len[IPVS_EST_TICK_CHAINS]; }; /* Context for estimation kthread */ struct ip_vs_est_kt_data { struct netns_ipvs *ipvs; struct task_struct *task; /* task if running */ struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS]; DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ unsigned long est_timer; /* estimation timer (jiffies) */ struct ip_vs_stats *calc_stats; /* Used for calculation */ int tick_len[IPVS_EST_NTICKS]; /* est count */ int id; /* ktid per netns */ int chain_max; /* max ests per tick chain */ int tick_max; /* max ests per tick */ int est_count; /* attached ests to kthread */ int est_max_count; /* max ests per kthread */ int add_row; /* row for new ests */ int est_row; /* estimated row */ }; struct dst_entry; struct iphdr; struct ip_vs_conn; struct ip_vs_app; struct sk_buff; struct ip_vs_proto_data; struct ip_vs_protocol { struct ip_vs_protocol *next; char *name; u16 protocol; u16 num_states; int dont_defrag; void (*init)(struct ip_vs_protocol *pp); void (*exit)(struct ip_vs_protocol *pp); int (*init_netns)(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd); void (*exit_netns)(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd); int (*conn_schedule)(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph); struct ip_vs_conn * (*conn_in_get)(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); struct ip_vs_conn * (*conn_out_get)(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); int (*snat_handler)(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph); int (*dnat_handler)(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph); const char *(*state_name)(int state); void (*state_transition)(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd); int (*register_app)(struct netns_ipvs *ipvs, struct ip_vs_app *inc); void (*unregister_app)(struct netns_ipvs *ipvs, struct ip_vs_app *inc); int (*app_conn_bind)(struct ip_vs_conn *cp); void (*debug_packet)(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg); void (*timeout_change)(struct ip_vs_proto_data *pd, int flags); }; /* protocol data per netns */ struct ip_vs_proto_data { struct ip_vs_proto_data *next; struct ip_vs_protocol *pp; int *timeout_table; /* protocol timeout table */ atomic_t appcnt; /* counter of proto app incs. */ struct tcp_states_t *tcp_state_table; }; struct ip_vs_protocol *ip_vs_proto_get(unsigned short proto); struct ip_vs_proto_data *ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto); struct ip_vs_conn_param { struct netns_ipvs *ipvs; const union nf_inet_addr *caddr; const union nf_inet_addr *vaddr; __be16 cport; __be16 vport; __u16 protocol; u16 af; const struct ip_vs_pe *pe; char *pe_data; __u8 pe_data_len; }; /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { struct hlist_node c_list; /* hashed list heads */ /* Protocol, addresses and port numbers */ __be16 cport; __be16 dport; __be16 vport; u16 af; /* address family */ union nf_inet_addr caddr; /* client address */ union nf_inet_addr vaddr; /* virtual address */ union nf_inet_addr daddr; /* destination address */ volatile __u32 flags; /* status flags */ __u16 protocol; /* Which protocol (TCP/UDP) */ __u16 daf; /* Address family of the dest */ struct netns_ipvs *ipvs; /* counter and timer */ refcount_t refcnt; /* reference count */ struct timer_list timer; /* Expiration timer */ volatile unsigned long timeout; /* timeout */ /* Flags and state transition */ spinlock_t lock; /* lock for state transition */ volatile __u16 state; /* state info */ volatile __u16 old_state; /* old state, to be used for * state transition triggered * synchronization */ __u32 fwmark; /* Fire wall mark from skb */ unsigned long sync_endtime; /* jiffies + sent_retries */ /* Control members */ struct ip_vs_conn *control; /* Master control connection */ atomic_t n_control; /* Number of controlled ones */ struct ip_vs_dest *dest; /* real server */ atomic_t in_pkts; /* incoming packet counter */ /* Packet transmitter for different forwarding methods. If it * mangles the packet, it must return NF_DROP or better NF_STOLEN, * otherwise this must be changed to a sk_buff **. * NF_ACCEPT can be returned when destination is local. */ int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); /* Note: we can group the following members into a structure, * in order to save more space, and the following members are * only used in VS/NAT anyway */ struct ip_vs_app *app; /* bound ip_vs_app object */ void *app_data; /* Application private data */ struct_group(sync_conn_opt, struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. struct */ ); const struct ip_vs_pe *pe; char *pe_data; __u8 pe_data_len; struct rcu_head rcu_head; }; /* Extended internal versions of struct ip_vs_service_user and ip_vs_dest_user * for IPv6 support. * * We need these to conveniently pass around service and destination * options, but unfortunately, we also need to keep the old definitions to * maintain userspace backwards compatibility for the setsockopt interface. */ struct ip_vs_service_user_kern { /* virtual service addresses */ u16 af; u16 protocol; union nf_inet_addr addr; /* virtual ip address */ __be16 port; u32 fwmark; /* firewall mark of service */ /* virtual service options */ char *sched_name; char *pe_name; unsigned int flags; /* virtual service flags */ unsigned int timeout; /* persistent timeout in sec */ __be32 netmask; /* persistent netmask or plen */ }; struct ip_vs_dest_user_kern { /* destination server address */ union nf_inet_addr addr; __be16 port; /* real server options */ unsigned int conn_flags; /* connection flags */ int weight; /* destination weight */ /* thresholds for active connections */ u32 u_threshold; /* upper threshold */ u32 l_threshold; /* lower threshold */ /* Address family of addr */ u16 af; u16 tun_type; /* tunnel type */ __be16 tun_port; /* tunnel port */ u16 tun_flags; /* tunnel flags */ }; /* * The information about the virtual service offered to the net and the * forwarding entries. */ struct ip_vs_service { struct hlist_node s_list; /* for normal service table */ struct hlist_node f_list; /* for fwmark-based service table */ atomic_t refcnt; /* reference counter */ u16 af; /* address family */ __u16 protocol; /* which protocol (TCP/UDP) */ union nf_inet_addr addr; /* IP address for virtual service */ __be16 port; /* port number for the service */ __u32 fwmark; /* firewall mark of the service */ unsigned int flags; /* service status flags */ unsigned int timeout; /* persistent timeout in ticks */ __be32 netmask; /* grouping granularity, mask/plen */ struct netns_ipvs *ipvs; struct list_head destinations; /* real server d-linked list */ __u32 num_dests; /* number of servers */ struct ip_vs_stats stats; /* statistics for the service */ /* for scheduling */ struct ip_vs_scheduler __rcu *scheduler; /* bound scheduler object */ spinlock_t sched_lock; /* lock sched_data */ void *sched_data; /* scheduler application data */ /* alternate persistence engine */ struct ip_vs_pe __rcu *pe; int conntrack_afmask; struct rcu_head rcu_head; }; /* Information for cached dst */ struct ip_vs_dest_dst { struct dst_entry *dst_cache; /* destination cache entry */ u32 dst_cookie; union nf_inet_addr dst_saddr; struct rcu_head rcu_head; }; /* The real server destination forwarding entry with ip address, port number, * and so on. */ struct ip_vs_dest { struct list_head n_list; /* for the dests in the service */ struct hlist_node d_list; /* for table with all the dests */ u16 af; /* address family */ __be16 port; /* port number of the server */ union nf_inet_addr addr; /* IP address of the server */ volatile unsigned int flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ atomic_t last_weight; /* server latest weight */ __u16 tun_type; /* tunnel type */ __be16 tun_port; /* tunnel port */ __u16 tun_flags; /* tunnel flags */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ unsigned long idle_start; /* start time, jiffies */ /* connection counters and thresholds */ atomic_t activeconns; /* active connections */ atomic_t inactconns; /* inactive connections */ atomic_t persistconns; /* persistent connections */ __u32 u_threshold; /* upper threshold */ __u32 l_threshold; /* lower threshold */ /* for destination cache */ spinlock_t dst_lock; /* lock of dst_cache */ struct ip_vs_dest_dst __rcu *dest_dst; /* cached dst info */ /* for virtual service */ struct ip_vs_service __rcu *svc; /* service it belongs to */ __u16 protocol; /* which protocol (TCP/UDP) */ __be16 vport; /* virtual port number */ union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ struct rcu_head rcu_head; struct list_head t_list; /* in dest_trash */ unsigned int in_rs_table:1; /* we are in rs_table */ }; /* The scheduler object */ struct ip_vs_scheduler { struct list_head n_list; /* d-linked list head */ char *name; /* scheduler name */ atomic_t refcnt; /* reference counter */ struct module *module; /* THIS_MODULE/NULL */ /* scheduler initializing service */ int (*init_service)(struct ip_vs_service *svc); /* scheduling service finish */ void (*done_service)(struct ip_vs_service *svc); /* dest is linked */ int (*add_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest); /* dest is unlinked */ int (*del_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest); /* dest is updated */ int (*upd_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest); /* selecting a server from the given service */ struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph); }; /* The persistence engine object */ struct ip_vs_pe { struct list_head n_list; /* d-linked list head */ char *name; /* scheduler name */ atomic_t refcnt; /* reference counter */ struct module *module; /* THIS_MODULE/NULL */ /* get the connection template, if any */ int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); bool (*ct_match)(const struct ip_vs_conn_param *p, struct ip_vs_conn *ct); u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); /* create connections for real-server outgoing packets */ struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct sk_buff *skb, const struct ip_vs_iphdr *iph, __be16 dport, __be16 cport); }; /* The application module object (a.k.a. app incarnation) */ struct ip_vs_app { struct list_head a_list; /* member in app list */ int type; /* IP_VS_APP_TYPE_xxx */ char *name; /* application module name */ __u16 protocol; struct module *module; /* THIS_MODULE/NULL */ struct list_head incs_list; /* list of incarnations */ /* members for application incarnations */ struct list_head p_list; /* member in proto app list */ struct ip_vs_app *app; /* its real application */ __be16 port; /* port number in net order */ atomic_t usecnt; /* usage counter */ struct rcu_head rcu_head; /* output hook: Process packet in inout direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* input hook: Process packet in outin direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* ip_vs_app initializer */ int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); /* ip_vs_app finish */ int (*done_conn)(struct ip_vs_app *, struct ip_vs_conn *); /* not used now */ int (*bind_conn)(struct ip_vs_app *, struct ip_vs_conn *, struct ip_vs_protocol *); void (*unbind_conn)(struct ip_vs_app *, struct ip_vs_conn *); int * timeout_table; int * timeouts; int timeouts_size; int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app, int *verdict, struct ip_vs_conn **cpp); struct ip_vs_conn * (*conn_in_get)(const struct sk_buff *skb, struct ip_vs_app *app, const struct iphdr *iph, int inverse); struct ip_vs_conn * (*conn_out_get)(const struct sk_buff *skb, struct ip_vs_app *app, const struct iphdr *iph, int inverse); int (*state_transition)(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_app *app); void (*timeout_change)(struct ip_vs_app *app, int flags); }; struct ipvs_master_sync_state { struct list_head sync_queue; struct ip_vs_sync_buff *sync_buff; unsigned long sync_queue_len; unsigned int sync_queue_delay; struct delayed_work master_wakeup_work; struct netns_ipvs *ipvs; }; struct ip_vs_sync_thread_data; /* How much time to keep dests in trash */ #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) struct ipvs_sync_daemon_cfg { union nf_inet_addr mcast_group; int syncid; u16 sync_maxlen; u16 mcast_port; u8 mcast_af; u8 mcast_ttl; /* multicast interface name */ char mcast_ifn[IP_VS_IFNAME_MAXLEN]; }; /* IPVS in network namespace */ struct netns_ipvs { int gen; /* Generation */ int enable; /* enable like nf_hooks do */ /* Hash table: for real service lookups */ #define IP_VS_RTAB_BITS 4 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) struct hlist_head rs_table[IP_VS_RTAB_SIZE]; /* ip_vs_app */ struct list_head app_list; /* ip_vs_proto */ #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE]; /* ip_vs_proto_tcp */ #ifdef CONFIG_IP_VS_PROTO_TCP #define TCP_APP_TAB_BITS 4 #define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) #define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) struct list_head tcp_apps[TCP_APP_TAB_SIZE]; #endif /* ip_vs_proto_udp */ #ifdef CONFIG_IP_VS_PROTO_UDP #define UDP_APP_TAB_BITS 4 #define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) #define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) struct list_head udp_apps[UDP_APP_TAB_SIZE]; #endif /* ip_vs_proto_sctp */ #ifdef CONFIG_IP_VS_PROTO_SCTP #define SCTP_APP_TAB_BITS 4 #define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS) #define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1) /* Hash table for SCTP application incarnations */ struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; #endif /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ int num_services; /* no of virtual services */ int num_services6; /* IPv6 virtual services */ /* Trash for destinations */ struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ /* Service counters */ atomic_t ftpsvc_counter; atomic_t nullsvc_counter; atomic_t conn_out_counter; #ifdef CONFIG_SYSCTL /* delayed work for expiring no dest connections */ struct delayed_work expire_nodest_conn_work; /* 1/rate drop and drop-entry variables */ struct delayed_work defense_work; /* Work handler */ int drop_rate; int drop_counter; int old_secure_tcp; atomic_t dropentry; /* locks in ctl.c */ spinlock_t dropentry_lock; /* drop entry handling */ spinlock_t droppacket_lock; /* drop packet handling */ spinlock_t securetcp_lock; /* state and timeout tables */ /* sys-ctl struct */ struct ctl_table_header *sysctl_hdr; struct ctl_table *sysctl_tbl; #endif /* sysctl variables */ int sysctl_amemthresh; int sysctl_am_droprate; int sysctl_drop_entry; int sysctl_drop_packet; int sysctl_secure_tcp; #ifdef CONFIG_IP_VS_NFCT int sysctl_conntrack; #endif int sysctl_snat_reroute; int sysctl_sync_ver; int sysctl_sync_ports; int sysctl_sync_persist_mode; unsigned long sysctl_sync_qlen_max; int sysctl_sync_sock_size; int sysctl_cache_bypass; int sysctl_expire_nodest_conn; int sysctl_sloppy_tcp; int sysctl_sloppy_sctp; int sysctl_expire_quiescent_template; int sysctl_sync_threshold[2]; unsigned int sysctl_sync_refresh_period; int sysctl_sync_retries; int sysctl_nat_icmp_send; int sysctl_pmtu_disc; int sysctl_backup_only; int sysctl_conn_reuse_mode; int sysctl_schedule_icmp; int sysctl_ignore_tunneled; int sysctl_run_estimation; #ifdef CONFIG_SYSCTL cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */ int est_cpulist_valid; /* cpulist set */ int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif /* ip_vs_lblc */ int sysctl_lblc_expiration; struct ctl_table_header *lblc_ctl_header; struct ctl_table *lblc_ctl_table; /* ip_vs_lblcr */ int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; /* ip_vs_est */ struct delayed_work est_reload_work;/* Reload kthread tasks */ struct mutex est_mutex; /* protect kthread tasks */ struct hlist_head est_temp_list; /* Ests during calc phase */ struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */ unsigned long est_max_threads;/* Hard limit of kthreads */ int est_calc_phase; /* Calculation phase */ int est_chain_max; /* Calculated chain_max */ int est_kt_count; /* Allocated ptrs */ int est_add_ktid; /* ktid where to add ests */ atomic_t est_genid; /* kthreads reload genid */ atomic_t est_genid_done; /* applied genid */ /* ip_vs_sync */ spinlock_t sync_lock; struct ipvs_master_sync_state *ms; spinlock_t sync_buff_lock; struct ip_vs_sync_thread_data *master_tinfo; struct ip_vs_sync_thread_data *backup_tinfo; int threads_mask; volatile int sync_state; struct mutex sync_mutex; struct ipvs_sync_daemon_cfg mcfg; /* Master Configuration */ struct ipvs_sync_daemon_cfg bcfg; /* Backup Configuration */ /* net name space ptr */ struct net *net; /* Needed by timer routines */ /* Number of heterogeneous destinations, needed because heterogeneous * are not supported when synchronization is enabled. */ unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ }; #define DEFAULT_SYNC_THRESHOLD 3 #define DEFAULT_SYNC_PERIOD 50 #define DEFAULT_SYNC_VER 1 #define DEFAULT_SLOPPY_TCP 0 #define DEFAULT_SLOPPY_SCTP 0 #define DEFAULT_SYNC_REFRESH_PERIOD (0U * HZ) #define DEFAULT_SYNC_RETRIES 0 #define IPVS_SYNC_WAKEUP_RATE 8 #define IPVS_SYNC_QLEN_MAX (IPVS_SYNC_WAKEUP_RATE * 4) #define IPVS_SYNC_SEND_DELAY (HZ / 50) #define IPVS_SYNC_CHECK_PERIOD HZ #define IPVS_SYNC_FLUSH_TIME (HZ * 2) #define IPVS_SYNC_PORTS_MAX (1 << 6) #ifdef CONFIG_SYSCTL static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_threshold[0]; } static inline int sysctl_sync_period(struct netns_ipvs *ipvs) { return READ_ONCE(ipvs->sysctl_sync_threshold[1]); } static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) { return READ_ONCE(ipvs->sysctl_sync_refresh_period); } static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_retries; } static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_ver; } static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs) { return ipvs->sysctl_sloppy_tcp; } static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) { return ipvs->sysctl_sloppy_sctp; } static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { return READ_ONCE(ipvs->sysctl_sync_ports); } static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_persist_mode; } static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_qlen_max; } static inline int sysctl_sync_sock_size(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_sock_size; } static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs) { return ipvs->sysctl_pmtu_disc; } static inline int sysctl_backup_only(struct netns_ipvs *ipvs) { return ipvs->sync_state & IP_VS_STATE_BACKUP && ipvs->sysctl_backup_only; } static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) { return ipvs->sysctl_conn_reuse_mode; } static inline int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return ipvs->sysctl_expire_nodest_conn; } static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs) { return ipvs->sysctl_schedule_icmp; } static inline int sysctl_ignore_tunneled(struct netns_ipvs *ipvs) { return ipvs->sysctl_ignore_tunneled; } static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) { return ipvs->sysctl_cache_bypass; } static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) { return ipvs->sysctl_run_estimation; } static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) { if (ipvs->est_cpulist_valid) return ipvs->sysctl_est_cpulist; else return housekeeping_cpumask(HK_TYPE_KTHREAD); } static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return ipvs->sysctl_est_nice; } #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_THRESHOLD; } static inline int sysctl_sync_period(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_PERIOD; } static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_REFRESH_PERIOD; } static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_RETRIES & 3; } static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) { return DEFAULT_SYNC_VER; } static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs) { return DEFAULT_SLOPPY_TCP; } static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) { return DEFAULT_SLOPPY_SCTP; } static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { return 1; } static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) { return 0; } static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return IPVS_SYNC_QLEN_MAX; } static inline int sysctl_sync_sock_size(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs) { return 1; } static inline int sysctl_backup_only(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) { return 1; } static inline int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_schedule_icmp(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_ignore_tunneled(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) { return 0; } static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) { return 1; } static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) { return housekeeping_cpumask(HK_TYPE_KTHREAD); } static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return IPVS_EST_NICE; } #endif /* IPVS core functions * (from ip_vs_core.c) */ const char *ip_vs_proto_name(unsigned int proto); void ip_vs_init_hash_table(struct list_head *table, int rows); struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct sk_buff *skb, const struct ip_vs_iphdr *iph, __be16 dport, __be16 cport); #define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table((t), ARRAY_SIZE((t))) #define IP_VS_APP_TYPE_FTP 1 /* ip_vs_conn handling functions * (from ip_vs_conn.c) */ enum { IP_VS_DIR_INPUT = 0, IP_VS_DIR_OUTPUT, IP_VS_DIR_INPUT_ONLY, IP_VS_DIR_LAST, }; static inline void ip_vs_conn_fill_param(struct netns_ipvs *ipvs, int af, int protocol, const union nf_inet_addr *caddr, __be16 cport, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { p->ipvs = ipvs; p->af = af; p->protocol = protocol; p->caddr = caddr; p->cport = cport; p->vaddr = vaddr; p->vport = vport; p->pe = NULL; p->pe_data = NULL; } struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph); /* Get reference to gain full access to conn. * By default, RCU read-side critical sections have access only to * conn fields and its PE data, see ip_vs_conn_rcu_free() for reference. */ static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp) { return refcount_inc_not_zero(&cp->refcnt); } /* put back the conn without restarting its timer */ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) { smp_mb__before_atomic(); refcount_dec(&cp->refcnt); } void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark); void ip_vs_conn_expire_now(struct ip_vs_conn *cp); const char *ip_vs_state_name(const struct ip_vs_conn *cp); void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp); int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest); void ip_vs_random_dropentry(struct netns_ipvs *ipvs); int ip_vs_conn_init(void); void ip_vs_conn_cleanup(void); static inline void ip_vs_control_del(struct ip_vs_conn *cp) { struct ip_vs_conn *ctl_cp = cp->control; if (!ctl_cp) { IP_VS_ERR_BUF("request control DEL for uncontrolled: " "%s:%d to %s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)); return; } IP_VS_DBG_BUF(7, "DELeting control for: " "cp.dst=%s:%d ctl_cp.dst=%s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &ctl_cp->caddr), ntohs(ctl_cp->cport)); cp->control = NULL; if (atomic_read(&ctl_cp->n_control) == 0) { IP_VS_ERR_BUF("BUG control DEL with n=0 : " "%s:%d to %s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)); return; } atomic_dec(&ctl_cp->n_control); } static inline void ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) { if (cp->control) { IP_VS_ERR_BUF("request control ADD for already controlled: " "%s:%d to %s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)); ip_vs_control_del(cp); } IP_VS_DBG_BUF(7, "ADDing control for: " "cp.dst=%s:%d ctl_cp.dst=%s:%d\n", IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &ctl_cp->caddr), ntohs(ctl_cp->cport)); cp->control = ctl_cp; atomic_inc(&ctl_cp->n_control); } /* Mark our template as assured */ static inline void ip_vs_control_assure_ct(struct ip_vs_conn *cp) { struct ip_vs_conn *ct = cp->control; if (ct && !(ct->state & IP_VS_CTPL_S_ASSURED) && (ct->flags & IP_VS_CONN_F_TEMPLATE)) ct->state |= IP_VS_CTPL_S_ASSURED; } /* IPVS netns init & cleanup functions */ int ip_vs_estimator_net_init(struct netns_ipvs *ipvs); int ip_vs_control_net_init(struct netns_ipvs *ipvs); int ip_vs_protocol_net_init(struct netns_ipvs *ipvs); int ip_vs_app_net_init(struct netns_ipvs *ipvs); int ip_vs_conn_net_init(struct netns_ipvs *ipvs); int ip_vs_sync_net_init(struct netns_ipvs *ipvs); void ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_app_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_control_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_service_nets_cleanup(struct list_head *net_list); /* IPVS application functions * (from ip_vs_app.c) */ #define IP_VS_APP_MAX_PORTS 8 struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app); void unregister_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app); int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp); void ip_vs_unbind_app(struct ip_vs_conn *cp); int register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port); int ip_vs_app_inc_get(struct ip_vs_app *inc); void ip_vs_app_inc_put(struct ip_vs_app *inc); int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb, struct ip_vs_iphdr *ipvsh); int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb, struct ip_vs_iphdr *ipvsh); int register_ip_vs_pe(struct ip_vs_pe *pe); int unregister_ip_vs_pe(struct ip_vs_pe *pe); struct ip_vs_pe *ip_vs_pe_getbyname(const char *name); struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name); /* Use a #define to avoid all of module.h just for these trivial ops */ #define ip_vs_pe_get(pe) \ if (pe && pe->module) \ __module_get(pe->module); #define ip_vs_pe_put(pe) \ if (pe && pe->module) \ module_put(pe->module); /* IPVS protocol functions (from ip_vs_proto.c) */ int ip_vs_protocol_init(void); void ip_vs_protocol_cleanup(void); void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags); int *ip_vs_create_timeout_table(int *table, int size); void ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg); extern struct ip_vs_protocol ip_vs_protocol_tcp; extern struct ip_vs_protocol ip_vs_protocol_udp; extern struct ip_vs_protocol ip_vs_protocol_icmp; extern struct ip_vs_protocol ip_vs_protocol_esp; extern struct ip_vs_protocol ip_vs_protocol_ah; extern struct ip_vs_protocol ip_vs_protocol_sctp; /* Registering/unregistering scheduler functions * (from ip_vs_sched.c) */ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); int ip_vs_bind_scheduler(struct ip_vs_service *svc, struct ip_vs_scheduler *scheduler); void ip_vs_unbind_scheduler(struct ip_vs_service *svc, struct ip_vs_scheduler *sched); struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *ignored, struct ip_vs_iphdr *iph); int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph); void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg); /* IPVS control data and functions (from ip_vs_ctl.c) */ extern struct ip_vs_stats ip_vs_stats; extern int sysctl_ip_vs_sync_ver; struct ip_vs_service * ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport); bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); struct ip_vs_dest * ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, const union nf_inet_addr *daddr, __be16 tun_port); int ip_vs_use_count_inc(void); void ip_vs_use_count_dec(void); int ip_vs_register_nl_ioctl(void); void ip_vs_unregister_nl_ioctl(void); int ip_vs_control_init(void); void ip_vs_control_cleanup(void); struct ip_vs_dest * ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark, __u32 flags); void ip_vs_try_bind_dest(struct ip_vs_conn *cp); static inline void ip_vs_dest_hold(struct ip_vs_dest *dest) { refcount_inc(&dest->refcnt); } static inline void ip_vs_dest_put(struct ip_vs_dest *dest) { smp_mb__before_atomic(); refcount_dec(&dest->refcnt); } static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest) { if (refcount_dec_and_test(&dest->refcnt)) kfree(dest); } /* IPVS sync daemon data and function prototypes * (from ip_vs_sync.c) */ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *cfg, int state); int stop_sync_thread(struct netns_ipvs *ipvs, int state); void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts); /* IPVS rate estimator prototypes (from ip_vs_est.c) */ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); void ip_vs_est_reload_start(struct netns_ipvs *ipvs); int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL /* Stop tasks while cpulist is empty or if disabled with flag */ ipvs->est_stopped = !sysctl_run_estimation(ipvs) || (ipvs->est_cpulist_valid && cpumask_empty(sysctl_est_cpulist(ipvs))); #endif } static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL return ipvs->est_stopped; #else return false; #endif } static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) { unsigned int limit = IPVS_EST_CPU_KTHREADS * cpumask_weight(sysctl_est_cpulist(ipvs)); return max(1U, limit); } /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset, unsigned int hooknum, struct ip_vs_iphdr *iph); void ip_vs_dest_dst_rcu_free(struct rcu_head *head); #ifdef CONFIG_IP_VS_IPV6 int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset, unsigned int hooknum, struct ip_vs_iphdr *iph); #endif #ifdef CONFIG_SYSCTL /* This is a simple mechanism to ignore packets when * we are loaded. Just set ip_vs_drop_rate to 'n' and * we start to drop 1/rate of the packets */ static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { if (!ipvs->drop_rate) return 0; if (--ipvs->drop_counter > 0) return 0; ipvs->drop_counter = ipvs->drop_rate; return 1; } #else static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { return 0; } #endif #ifdef CONFIG_SYSCTL /* Enqueue delayed work for expiring no dest connections * Only run when sysctl_expire_nodest=1 */ static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs) { if (sysctl_expire_nodest_conn(ipvs)) queue_delayed_work(system_long_wq, &ipvs->expire_nodest_conn_work, 1); } void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs); #else static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs) {} #endif #define IP_VS_DFWD_METHOD(dest) (atomic_read(&(dest)->conn_flags) & \ IP_VS_CONN_F_FWD_MASK) /* ip_vs_fwd_tag returns the forwarding tag of the connection */ #define IP_VS_FWD_METHOD(cp) (cp->flags & IP_VS_CONN_F_FWD_MASK) static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp) { char fwd; switch (IP_VS_FWD_METHOD(cp)) { case IP_VS_CONN_F_MASQ: fwd = 'M'; break; case IP_VS_CONN_F_LOCALNODE: fwd = 'L'; break; case IP_VS_CONN_F_TUNNEL: fwd = 'T'; break; case IP_VS_CONN_F_DROUTE: fwd = 'R'; break; case IP_VS_CONN_F_BYPASS: fwd = 'B'; break; default: fwd = '?'; break; } return fwd; } void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); #ifdef CONFIG_IP_VS_IPV6 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); #endif __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset); static inline __wsum ip_vs_check_diff4(__be32 old, __be32 new, __wsum oldsum) { __be32 diff[2] = { ~old, new }; return csum_partial(diff, sizeof(diff), oldsum); } #ifdef CONFIG_IP_VS_IPV6 static inline __wsum ip_vs_check_diff16(const __be32 *old, const __be32 *new, __wsum oldsum) { __be32 diff[8] = { ~old[3], ~old[2], ~old[1], ~old[0], new[3], new[2], new[1], new[0] }; return csum_partial(diff, sizeof(diff), oldsum); } #endif static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum) { __be16 diff[2] = { ~old, new }; return csum_partial(diff, sizeof(diff), oldsum); } /* Forget current conntrack (unconfirmed) and attach notrack entry */ static inline void ip_vs_notrack(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct) { nf_conntrack_put(&ct->ct_general); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } #endif } #ifdef CONFIG_IP_VS_NFCT /* Netfilter connection tracking * (from ip_vs_nfct.c) */ static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL return ipvs->sysctl_conntrack; #else return 0; #endif } void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin); int ip_vs_confirm_conntrack(struct sk_buff *skb); void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, struct ip_vs_conn *cp, u_int8_t proto, const __be16 port, int from_rs); void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp); #else static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { return 0; } static inline void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) { } static inline int ip_vs_confirm_conntrack(struct sk_buff *skb) { return NF_ACCEPT; } static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) { } #endif /* CONFIG_IP_VS_NFCT */ /* Using old conntrack that can not be redirected to another real server? */ static inline bool ip_vs_conn_uses_old_conntrack(struct ip_vs_conn *cp, struct sk_buff *skb) { #ifdef CONFIG_IP_VS_NFCT enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct && nf_ct_is_confirmed(ct)) return true; #endif return false; } static inline int ip_vs_register_conntrack(struct ip_vs_service *svc) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) int afmask = (svc->af == AF_INET6) ? 2 : 1; int ret = 0; if (!(svc->conntrack_afmask & afmask)) { ret = nf_ct_netns_get(svc->ipvs->net, svc->af); if (ret >= 0) svc->conntrack_afmask |= afmask; } return ret; #else return 0; #endif } static inline void ip_vs_unregister_conntrack(struct ip_vs_service *svc) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) int afmask = (svc->af == AF_INET6) ? 2 : 1; if (svc->conntrack_afmask & afmask) { nf_ct_netns_put(svc->ipvs->net, svc->af); svc->conntrack_afmask &= ~afmask; } #endif } int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af); void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af); static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { /* We think the overhead of processing active connections is 256 * times higher than that of inactive connections in average. (This * 256 times might not be accurate, we will change it later) We * use the following formula to estimate the overhead now: * dest->activeconns*256 + dest->inactconns */ return (atomic_read(&dest->activeconns) << 8) + atomic_read(&dest->inactconns); } #ifdef CONFIG_IP_VS_PROTO_TCP INDIRECT_CALLABLE_DECLARE(int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); #endif #ifdef CONFIG_IP_VS_PROTO_UDP INDIRECT_CALLABLE_DECLARE(int udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); #endif #endif /* _NET_IP_VS_H */
32 136 23 4 4 4 4 4 4 4 4 3 1 4 973 15 246 245 245 241 4382 4382 4384 3412 3407 1010 1015 347 3 129 5 16 12 42 26 16 14 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal.h: mm/ internal definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H #include <linux/fs.h> #include <linux/khugepaged.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/pagewalk.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> #include <linux/tracepoint-defs.h> /* Internal core VMA manipulation functions. */ #include "vma.h" struct folio_batch; /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints * about IO, FS and watermark checking while ignoring placement * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) /* Control allocation cpuset and node placement constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) /* * Different from WARN_ON_ONCE(), no warning will be issued * when we specify __GFP_NOWARN. */ #define WARN_ON_ONCE_GFP(cond, gfp) ({ \ static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \ __warned = true; \ WARN_ON(1); \ } \ unlikely(__ret_warn_once); \ }) void page_writeback_init(void); /* * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. */ #define ENTIRELY_MAPPED 0x800000 #define FOLIO_PAGES_MAPPED (ENTIRELY_MAPPED - 1) /* * Flags passed to __show_mem() and show_free_areas() to suppress output in * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. * * Don't use this function outside of debugging code. */ static inline int folio_nr_pages_mapped(const struct folio *folio) { return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } /* * Retrieve the first entry of a folio based on a provided entry within the * folio. We cannot rely on folio->swap as there is no guarantee that it has * been initialized. Used for calling arch_swap_restore() */ static inline swp_entry_t folio_swap(swp_entry_t entry, const struct folio *folio) { swp_entry_t swap = { .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), }; return swap; } static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } /* * This is a file-backed mapping, and is about to be memory mapped - invoke its * mmap hook and safely handle error conditions. On error, VMA hooks will be * mutated. * * @file: File which backs the mapping. * @vma: VMA which we are mapping. * * Returns: 0 if success, error otherwise. */ static inline int mmap_file(struct file *file, struct vm_area_struct *vma) { int err = call_mmap(file, vma); if (likely(!err)) return 0; /* * OK, we tried to call the file hook for mmap(), but an error * arose. The mapping is in an inconsistent state and we most not invoke * any further hooks on it. */ vma->vm_ops = &vma_dummy_vm_ops; return err; } /* * If the VMA has a close hook then close it, and since closing it might leave * it in an inconsistent state which makes the use of any hooks suspect, clear * them down by installing dummy empty hooks. */ static inline void vma_close(struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->close) { vma->vm_ops->close(vma); /* * The mapping is in an inconsistent state, and no further hooks * may be invoked upon it. */ vma->vm_ops = &vma_dummy_vm_ops; } } #ifdef CONFIG_MMU /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; /* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ #define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) /* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ #define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { if (flags & FPB_IGNORE_DIRTY) pte = pte_mkclean(pte); if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) pte = pte_clear_soft_dirty(pte); return pte_wrprotect(pte_mkold(pte)); } /** * folio_pte_batch - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. * @addr: The user virtual address the first page is mapped at. * @start_ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @max_nr: The maximum number of table entries to consider. * @flags: Flags to modify the PTE batch semantics. * @any_writable: Optional pointer to indicate whether any entry except the * first one is writable. * @any_young: Optional pointer to indicate whether any entry except the * first one is young. * @any_dirty: Optional pointer to indicate whether any entry except the * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). * * start_ptep must map any page of the folio. max_nr must be at least one and * must be limited by the caller so scanning cannot exceed a single page table. * * Return: the number of table entries in the batch. */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, bool *any_writable, bool *any_young, bool *any_dirty) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte, *ptep; bool writable, young, dirty; int nr; if (any_writable) *any_writable = false; if (any_young) *any_young = false; if (any_dirty) *any_dirty = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); nr = pte_batch_hint(start_ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); ptep = start_ptep + nr; while (ptep < end_ptep) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); if (any_young) young = !!pte_young(pte); if (any_dirty) dirty = !!pte_dirty(pte); pte = __pte_batch_clear_ignored(pte, flags); if (!pte_same(pte, expected_pte)) break; /* * Stop immediately once we reached the end of the folio. In * corner cases the next PFN might fall into a different * folio. */ if (pte_pfn(pte) >= folio_end_pfn) break; if (any_writable) *any_writable |= writable; if (any_young) *any_young |= young; if (any_dirty) *any_dirty |= dirty; nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, nr); ptep += nr; } return min(ptep - start_ptep, max_nr); } /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta * @pte: The initial pte state; is_swap_pte(pte) must be true and * non_swap_entry() must be false. * @delta: The direction and the offset we are moving; forward if delta * is positive; backward if delta is negative * * Moves the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) { swp_entry_t entry = pte_to_swp_entry(pte); pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), (swp_offset(entry) + delta))); if (pte_swp_soft_dirty(pte)) new = pte_swp_mksoft_dirty(new); if (pte_swp_exclusive(pte)) new = pte_swp_mkexclusive(new); if (pte_swp_uffd_wp(pte)) new = pte_swp_mkuffd_wp(new); return new; } /** * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. * @pte: The initial pte state; is_swap_pte(pte) must be true and * non_swap_entry() must be false. * * Increments the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ static inline pte_t pte_next_swp_offset(pte_t pte) { return pte_move_swp_offset(pte, 1); } /** * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries * @start_ptep: Page table pointer for the first entry. * @max_nr: The maximum number of table entries to consider. * @pte: Page table entry for the first entry. * * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs * containing swap entries all with consecutive offsets and targeting the same * swap type, all with matching swp pte bits. * * max_nr must be at least one and must be limited by the caller so scanning * cannot exceed a single page table. * * Return: the number of table entries in the batch. */ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; swp_entry_t entry = pte_to_swp_entry(pte); pte_t *ptep = start_ptep + 1; unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); VM_WARN_ON(!is_swap_pte(pte)); VM_WARN_ON(non_swap_entry(entry)); cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } return ptep - start_ptep; } #endif /* CONFIG_MMU */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled); static inline void acct_reclaim_writeback(struct folio *folio) { pg_data_t *pgdat = folio_pgdat(folio); int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled); if (nr_throttled) __acct_reclaim_writeback(pgdat, folio, nr_throttled); } static inline void wake_throttle_isolated(pg_data_t *pgdat) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; if (waitqueue_active(wqh)) wake_up(wqh); } vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf); static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { vm_fault_t ret = __vmf_anon_prepare(vmf); if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vmf->vma); return ret; } vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int order); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long mapping_evict_folio(struct address_space *mapping, struct folio *folio); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed); /** * folio_evictable - Test whether a folio is evictable. * @folio: The folio to test. * * Test whether @folio is evictable -- i.e., should be placed on * active/inactive lists vs unevictable list. * * Reasons folio might not be evictable: * 1. folio's mapping marked unevictable * 2. One of the pages in the folio is part of an mlocked VMA */ static inline bool folio_evictable(struct folio *folio) { bool ret; /* Prevent address_space of inode and swap cache from being freed */ rcu_read_lock(); ret = !mapping_unevictable(folio_mapping(folio)) && !folio_test_mlocked(folio); rcu_read_unlock(); return ret; } /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } /* * Return true if a folio needs ->release_folio() calling upon it. */ static inline bool folio_needs_release(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); return folio_has_private(folio) || (mapping && mapping_release_always(mapping)); } extern unsigned long highest_memmap_pfn; /* * Maximum number of reclaim retries without progress before the OOM * killer is consider the only way forward. */ #define MAX_RECLAIM_RETRIES 16 /* * in mm/vmscan.c: */ bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); /* * in mm/rmap.c: */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c */ #define K(x) ((x) << (PAGE_SHIFT-10)) extern char * const zone_names[MAX_NR_ZONES]; /* perform sanity checks on struct pages being allocated or freed */ DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); extern int min_free_kbytes; void setup_per_zone_wmarks(void); void calculate_min_free_kbytes(void); int __meminit init_per_zone_wmark_min(void); void page_alloc_sysctl_init(void); /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; /* * highest_zoneidx represents highest usable zone index of * the allocation request. Due to the nature of the zone, * memory on lower zone than the highest_zoneidx will be * protected by lowmem_reserve[highest_zoneidx]. * * highest_zoneidx is also used by reclaim/compaction to limit * the target zone since higher zone than this index cannot be * usable for this allocation request. */ enum zone_type highest_zoneidx; bool spread_dirty_pages; }; /* * This function returns the order of a free page in the buddy system. In * general, page_zone(page)->lock must be held by the caller to prevent the * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use buddy_order_unsafe() below. */ static inline unsigned int buddy_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); } /* * Like buddy_order(), but for callers who cannot afford to hold the zone lock. * PageBuddy() should be checked first by the caller to minimize race window, * and invalid values must be handled gracefully. * * READ_ONCE is used so that if the caller assigns the result into a local * variable and e.g. tests it for valid range before using, the compiler cannot * decide to remove the variable and inline the page_private(page) multiple * times, potentially observing different values in the tests and the actual * use of the result. */ #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) /* * This function checks whether a page is free && is the buddy * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * * For recording whether a page is in the buddy system, we set PageBuddy. * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ static inline bool page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { if (!page_is_guard(buddy) && !PageBuddy(buddy)) return false; if (buddy_order(buddy) != order) return false; /* * zone check is done late to avoid uselessly calculating * zone/node ids for pages that could never merge. */ if (page_zone_id(page) != page_zone_id(buddy)) return false; VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return true; } /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). * * 1) Any buddy B1 will have an order O twin B2 which satisfies * the following equation: * B2 = B1 ^ (1 << O) * For example, if the starting buddy (buddy2) is #8 its order * 1 buddy is #10: * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 * * 2) Any buddy B will have an order O+1 parent P which * satisfies the following equation: * P = B & ~(1 << O) * * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER */ static inline unsigned long __find_buddy_pfn(unsigned long page_pfn, unsigned int order) { return page_pfn ^ (1 << order); } /* * Find the buddy of @page and validate it. * @page: The input page * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the * function is used in the performance-critical __free_one_page(). * @order: The order of the page * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to * page_to_pfn(). * * The found buddy can be a non PageBuddy, out of @page's zone, or its order is * not the same as @page. The validation is necessary before use it. * * Return: the found buddy page or NULL if not found. */ static inline struct page *find_buddy_page_pfn(struct page *page, unsigned long pfn, unsigned int order, unsigned long *buddy_pfn) { unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order); struct page *buddy; buddy = page + (__buddy_pfn - pfn); if (buddy_pfn) *buddy_pfn = __buddy_pfn; if (page_is_buddy(page, buddy, order)) return buddy; return NULL; } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone); static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone) { if (zone->contiguous) return pfn_to_page(start_pfn); return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } void set_zone_contiguous(struct zone *zone); static inline void clear_zone_contiguous(struct zone *zone) { zone->contiguous = false; } extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order, enum meminit_context context); /* * This will have no effect, other than possibly generating a warning, if the * caller passes in a non-large folio. */ static inline void folio_set_order(struct folio *folio, unsigned int order) { if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef CONFIG_64BIT folio->_folio_nr_pages = 1U << order; #endif } bool __folio_unqueue_deferred_split(struct folio *folio); static inline bool folio_unqueue_deferred_split(struct folio *folio) { if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) return false; /* * At this point, there is no one trying to add the folio to * deferred_list. If folio is not in deferred_list, it's safe * to check without acquiring the split_queue_lock. */ if (data_race(list_empty(&folio->_deferred_list))) return false; return __folio_unqueue_deferred_split(folio); } static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; if (folio && folio_test_large(folio)) folio_set_large_rmappable(folio); return folio; } static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; folio_set_order(folio, order); atomic_set(&folio->_large_mapcount, -1); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); if (order > 1) INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) { struct page *p = head + tail_idx; p->mapping = TAIL_MAPPING; set_compound_head(p, head); set_page_private(p, 0); } extern void prep_compound_page(struct page *page, unsigned int order); void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, nodemask_t *); #define __alloc_frozen_pages(...) \ alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__)) void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); #ifdef CONFIG_NUMA struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order); #else static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order) { return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL); } #endif #define alloc_frozen_pages(...) \ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); extern void zone_pcp_init(struct zone *zone); extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* * in mm/compaction.c */ /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts * at the end of a zone and migrate_pfn begins at the start. Movable pages * are moved to the end of a zone during a compaction run and the run * completes when free_pfn <= migrate_pfn */ struct compact_control { struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ /* * Acts as an in/out parameter to page isolation for migration. * isolate_migratepages uses it as a search base. * isolate_migratepages_block will update the value to the next pfn * after the last isolated one. */ unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; unsigned short fast_search_fail;/* failures to use free list searches */ short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ bool finish_pageblock; /* Scan the remainder of a pageblock. Used * when there are potentially transient * isolation or migration failures to * ensure forward progress. */ bool alloc_contig; /* alloc_contig_range allocation */ }; /* * Used in direct compaction when a page should be taken from the freelists * immediately when one is created during the free path. */ struct capture_control { struct compact_control *cc; struct page *page; }; unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool only_stealable, bool *can_steal); static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); } /* mm/util.c */ struct anon_vma *folio_anon_vma(const struct folio *folio); #ifdef CONFIG_MMU void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); /* * NOTE: This function can't tell whether the folio is "fully mapped" in the * range. * "fully mapped" means all the pages of folio is associated with the page * table of range while this function just check whether the folio range is * within the range [start, end). Function caller needs to do page table * check if it cares about the page table association. * * Typical usage (like mlock or madvise) is: * Caller knows at least 1 page of folio is associated with page table of VMA * and the range [start, end) is intersect with the VMA range. Caller wants * to know whether the folio is fully associated with the range. It calls * this function to check whether the folio is in the range first. Then checks * the page table to know whether the folio is fully mapped to the range. */ static inline bool folio_within_range(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end) { pgoff_t pgoff, addr; unsigned long vma_pglen = vma_pages(vma); VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); if (start > end) return false; if (start < vma->vm_start) start = vma->vm_start; if (end > vma->vm_end) end = vma->vm_end; pgoff = folio_pgoff(folio); /* if folio start address is not in vma range */ if (!in_range(pgoff, vma->vm_pgoff, vma_pglen)) return false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); return !(addr < start || end - addr < folio_size(folio)); } static inline bool folio_within_vma(struct folio *folio, struct vm_area_struct *vma) { return folio_within_range(folio, vma, vma->vm_start, vma->vm_end); } /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at * the end of folio_remove_rmap_*(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). */ void mlock_folio(struct folio *folio); static inline void mlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * The VM_SPECIAL check here serves two purposes. * 1) VM_IO check prevents migration from double-counting during mlock. * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED * is never left set on a VM_SPECIAL vma, there is an interval while * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may * still be set while VM_SPECIAL bits are added: so ignore it then. */ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) mlock_folio(folio); } void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * munlock if the function is called. Ideally, we should only * do munlock if any page of folio is unmapped from VMA and * cause folio not fully mapped to VMA. * * But it's not easy to confirm that's the situation. So we * always munlock the folio and page reclaim will correct it * if it's wrong. */ if (unlikely(vma->vm_flags & VM_LOCKED)) munlock_folio(folio); } void mlock_new_folio(struct folio *folio); bool need_mlock_drain(int cpu); void mlock_drain_local(void); void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /** * vma_address - Find the virtual address a page range is mapped at * @vma: The vma which maps this object. * @pgoff: The page offset within its object. * @nr_pages: The number of pages to consider. * * If any page in this range is mapped by this VMA, return the first address * where any of these pages appear. Otherwise, return -EFAULT. */ static inline unsigned long vma_address(const struct vm_area_struct *vma, pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; if (pgoff >= vma->vm_pgoff) { address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address >= vma->vm_end) address = -EFAULT; } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) { /* Test above avoids possibility of wrap to 0 on 32-bit */ address = vma->vm_start; } else { address = -EFAULT; } return address; } /* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. */ static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) { struct vm_area_struct *vma = pvmw->vma; pgoff_t pgoff; unsigned long address; /* Common case, plus ->pgoff is invalid for KSM */ if (pvmw->nr_pages == 1) return pvmw->address + PAGE_SIZE; pgoff = pvmw->pgoff + pvmw->nr_pages; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address > vma->vm_end) address = vma->vm_end; return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, struct file *fpin) { int flags = vmf->flags; if (fpin) return fpin; /* * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or * anything, so we only pin the file and drop the mmap_lock if only * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. */ if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); release_fault_lock(vmf); } return fpin; } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void mlock_new_folio(struct folio *folio) { } static inline bool need_mlock_drain(int cpu) { return false; } static inline void mlock_drain_local(void) { } static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } #endif /* !CONFIG_MMU */ /* Memory initialisation debug and verification */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT DECLARE_STATIC_KEY_TRUE(deferred_pages); bool __init deferred_grow_zone(struct zone *zone, unsigned int order); #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE }; #ifdef CONFIG_DEBUG_MEMORY_INIT extern int mminit_loglevel; #define mminit_dprintk(level, prefix, fmt, arg...) \ do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ } while (0) extern void mminit_verify_pageflags_layout(void); extern void mminit_verify_zonelist(void); #else static inline void mminit_dprintk(enum mminit_level level, const char *prefix, const char *fmt, ...) { } static inline void mminit_verify_pageflags_layout(void) { } static inline void mminit_verify_zonelist(void) { } #endif /* CONFIG_DEBUG_MEMORY_INIT */ #define NODE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_FULL -1 #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } static inline int find_next_best_node(int node, nodemask_t *used_node_mask) { return NUMA_NO_NODE; } #endif /* * mm/memory-failure.c */ #ifdef CONFIG_MEMORY_FAILURE void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; #define MAGIC_HWPOISON 0x48575053U /* HWPS */ void SetPageHWPoisonTakenOff(struct page *page); void ClearPageHWPoisonTakenOff(struct page *page); bool take_page_off_buddy(struct page *page); bool put_page_back_buddy(struct page *page); struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long ksm_addr); unsigned long page_mapped_in_vma(const struct page *page, struct vm_area_struct *vma); #else static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) { } #endif extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); struct folio *alloc_migrate_folio(struct folio *src, unsigned long private); unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN #define ALLOC_WMARK_LOW WMARK_LOW #define ALLOC_WMARK_HIGH WMARK_HIGH #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ /* Mask to get the watermark bits */ #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) /* * Only MMU archs have async oom victim reclaim - aka oom_reaper so we * cannot assume a reduced access to memory reserves is sufficient for * !MMU */ #ifdef CONFIG_MMU #define ALLOC_OOM 0x08 #else #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif #define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access * to 25% of the min watermark or * 62.5% if __GFP_HIGH is set. */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ #else #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; /* * only for MM internal work items which do not depend on * any allocations or locks which might depend on allocations */ extern struct workqueue_struct *mm_percpu_wq; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { } static inline void try_to_unmap_flush_dirty(void) { } static inline void flush_tlb_batched_pending(struct mm_struct *mm) { } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; static inline bool is_migrate_highatomic(enum migratetype migratetype) { return migratetype == MIGRATE_HIGHATOMIC; } void setup_zone_pageset(struct zone *zone); struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; enum migrate_reason reason; }; /* * mm/filemap.c */ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size); /* * mm/vmalloc.c */ #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) { } static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { return -EINVAL; } #endif int __must_check __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid); void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); /* * mm/gup.c */ int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags); /* * mm/huge_memory.c */ void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, bool write); void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); /* * Parses a string with mem suffixes into its order. Useful to parse kernel * parameters. */ static inline int get_order_from_str(const char *size_str, unsigned long valid_orders) { unsigned long size; char *endptr; int order; size = memparse(size_str, &endptr); if (!is_power_of_2(size)) return -EINVAL; order = get_order(size); if (BIT(order) & ~valid_orders) return -EINVAL; return order; } enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, /* a retry, previous pass started an IO */ FOLL_TRIED = 1 << 17, /* we are working on non-current tsk/mm */ FOLL_REMOTE = 1 << 18, /* pages must be released via unpin_user_page */ FOLL_PIN = 1 << 19, /* gup_fast: prevent fall-back to slow gup */ FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the * GUP pin will remain consistent with the pages mapped into the page tables * of the MM. * * Temporary unmapping of PageAnonExclusive() pages or clearing of * PageAnonExclusive() has to protect against concurrent GUP: * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq * * GUP-fast and KSM or temporary unmapping (swap, migration): see * folio_try_share_anon_rmap_*() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. * * If the vma is NULL, we're coming from the GUP-fast path and might have * to fallback to the slow path just to lookup the vma. */ static inline bool gup_must_unshare(struct vm_area_struct *vma, unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry * has to be writable -- and if it references (part of) an anonymous * folio, that part is required to be marked exclusive. */ if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) return false; /* * Note: PageAnon(page) is stable until the page is actually getting * freed. */ if (!PageAnon(page)) { /* * We only care about R/O long-term pining: R/O short-term * pinning does not have the semantics to observe successive * changes through the process page tables. */ if (!(flags & FOLL_LONGTERM)) return false; /* We really need the vma ... */ if (!vma) return true; /* * ... because we only care about writable private ("COW") * mappings where we have to break COW early. */ return is_cow_mapping(vma->vm_flags); } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_rmb(); /* * Note that KSM pages cannot be exclusive, and consequently, * cannot get pinned. */ return !PageAnonExclusive(page); } extern bool mirrored_kernelcore; extern bool memblock_has_mirror(void); static __always_inline void vma_set_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; } static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty * enablements, because when without soft-dirty being compiled in, * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) * will be constantly true. */ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return false; /* * Soft-dirty is kind of special: its tracking is enabled when the * vma flags not set. */ return !(vma->vm_flags & VM_SOFTDIRTY); } static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd) { return vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd); } static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte) { return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte); } void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid); /* shrinker related functions */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); return shrinker->name ? 0 : -ENOMEM; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { kfree_const(shrinker->name); shrinker->name = NULL; } extern int shrinker_debugfs_add(struct shrinker *shrinker); extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id); extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id); #else /* CONFIG_SHRINKER_DEBUG */ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const char *fmt, va_list ap) { return 0; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { } static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id) { *debugfs_id = -1; return NULL; } static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id) { } #endif /* CONFIG_SHRINKER_DEBUG */ /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; #define mapping_set_update(xas, mapping) do { \ if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ xas_set_update(xas, workingset_update_node); \ xas_set_lru(xas, &shadow_nodes); \ } \ } while (0) /* mremap.c */ unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks, bool for_stack); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); #else /* CONFIG_UNACCEPTED_MEMORY */ static inline void accept_page(struct page *page) { } #endif /* CONFIG_UNACCEPTED_MEMORY */ /* pagewalk.c */ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); /* pt_reclaim.c */ bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, pmd_t pmdval); void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, struct mmu_gather *tlb); #ifdef CONFIG_PT_RECLAIM bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, struct zap_details *details); #else static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, struct zap_details *details) { return false; } #endif /* CONFIG_PT_RECLAIM */ #endif /* __MM_INTERNAL_H */
54 54 31 30 30 28 1 30 31 29 1 675 712 712 707 2 534 186 679 675 674 603 77 676 118 118 102 118 11 16 86 88 89 87 689 18 616 102 73 88 708 627 88 704 16 707 678 39 658 56 614 713 709 710 712 715 10 695 18 125 3 24 865 865 872 865 105 866 335 713 10 5 5 5 4 98 10 10 10 10 10 10 4 4 62 62 63 61 62 62 63 61 63 63 62 63 63 63 68 5 63 1427 116 1388 10 1430 22 254 109 1198 1160 36 1433 1434 1431 169 1416 1410 4 1377 115 1419 1226 250 1389 109 1418 200 837 84 537 258 1406 1369 1381 1375 1372 1370 1373 1361 1371 7 7 18 4 4 281 4 285 84 155 222 224 6 220 240 241 244 79 37 243 2 2 247 246 241 230 226 227 1 234 234 233 235 233 1 1 1 1 1 231 232 232 235 232 233 230 10 244 18 229 148 106 106 105 19 229 147 105 129 130 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 116 17 86 98 5 5 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 // SPDX-License-Identifier: GPL-2.0-only /* * fs/dcache.c * * Complete reimplementation * (C) 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ /* * Notes on the allocation strategy: * * The dcache is a master of the icache - whenever a dcache entry * exists, the inode will always exist. "iput()" is done either when * the dcache entry is deleted or garbage collected. */ #include <linux/ratelimit.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/cache.h> #include <linux/export.h> #include <linux/security.h> #include <linux/seqlock.h> #include <linux/memblock.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/list_lru.h> #include "internal.h" #include "mount.h" #include <asm/runtime-const.h> /* * Usage: * dcache->d_inode->i_lock protects: * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table * s_roots bl list spinlock protects: * - the s_roots list (see __d_drop) * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags * - d_name * - d_lru * - d_count * - d_unhashed() * - d_parent and d_chilren * - childrens' d_sib and d_parent * - d_u.d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock * dentry->d_lock * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_roots lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock * ... * dentry->d_parent->d_lock * dentry->d_lock * * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __ro_after_init; const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); const struct qstr dotdot_name = QSTR_INIT("..", 2); EXPORT_SYMBOL(dotdot_name); /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try * to make this good - I've just made it work. * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. * * Marking the variables "used" ensures that the compiler doesn't * optimize them away completely on architectures with runtime * constant infrastructure, this allows debuggers to see their * values. But updating these values has no effect on those arches. */ static unsigned int d_hash_shift __ro_after_init __used; static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { return runtime_const_ptr(dentry_hashtable) + runtime_const_shift_right_32(hashlen, d_hash_shift); } #define IN_LOOKUP_SHIFT 10 static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } struct dentry_stat_t { long nr_dentry; long nr_unused; long age_limit; /* age in seconds */ long want_pages; /* pages requested by system */ long nr_negative; /* # of unused negative dentries */ long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ static struct dentry_stat_t dentry_stat = { .age_limit = 45, }; /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_negative(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_negative, i); return sum < 0 ? 0 : sum; } static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, { .procname = "dentry-negative", .data = &dentry_negative_policy, .maxlen = sizeof(dentry_negative_policy), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static int __init init_fs_dcache_sysctls(void) { register_sysctl_init("fs", fs_dcache_sysctls); return 0; } fs_initcall(init_fs_dcache_sysctls); #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. * * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. */ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; for (;;) { a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; if (unlikely(a != b)) return 1; cs += sizeof(unsigned long); ct += sizeof(unsigned long); tcount -= sizeof(unsigned long); if (!tcount) return 0; } mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } #else static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { do { if (*cs != *ct) return 1; cs++; ct++; tcount--; } while (tcount); return 0; } #endif static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { /* * Be careful about RCU walk racing with rename: * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The * RCU walk will check the sequence count eventually, * and catch it. And we won't overrun the buffer, * because we're reading the name pointer atomically, * and a dentry name is guaranteed to be properly * terminated with a NUL byte. * * End result: even if 'len' is wrong, we'll exit * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } /* * long names are allocated separately from dentry and never modified. * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot() * for the reason why ->count and ->head can't be combined into a union. * dentry_string_cmp() relies upon ->name[] being word-aligned. */ struct external_name { atomic_t count; struct rcu_head head; unsigned char name[] __aligned(sizeof(unsigned long)); }; static inline struct external_name *external_name(struct dentry *dentry) { return container_of(dentry->d_name.name, struct external_name, name[0]); } static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) { return dentry->d_name.name != dentry->d_shortname.string; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { unsigned seq; const unsigned char *s; rcu_read_lock(); retry: seq = read_seqcount_begin(&dentry->d_seq); s = READ_ONCE(dentry->d_name.name); name->name.hash_len = dentry->d_name.hash_len; name->name.name = name->inline_name.string; if (likely(s == dentry->d_shortname.string)) { name->inline_name = dentry->d_shortname; } else { struct external_name *p; p = container_of(s, struct external_name, name[0]); // get a valid reference if (unlikely(!atomic_inc_not_zero(&p->count))) goto retry; name->name.name = s; } if (read_seqcount_retry(&dentry->d_seq, seq)) { release_dentry_name_snapshot(name); goto retry; } rcu_read_unlock(); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { if (unlikely(name->name.name != name->inline_name.string)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->count))) kfree_rcu(p, head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) { unsigned flags; dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; /* * The negative counter only tracks dentries on the LRU. Don't inc if * d_lru is on another list. */ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else iput(inode); } /* * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry * is in use - which includes both the "real" per-superblock * LRU list _and_ the DCACHE_SHRINK_LIST use. * * The DCACHE_SHRINK_LIST bit is set whenever the dentry is * on the shrink list (ie not on the superblock LRU list). * * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * * The per-cpu "nr_dentry_negative" counters are only updated * when deleted from or added to the per-superblock LRU list, not * from/to the shrink list. That is to avoid an unneeded dec/inc * pair when moving from LRU to shrink list in select_collect(). * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); this_cpu_dec(nr_dentry_unused); } static void d_shrink_add(struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, 0); list_add(&dentry->d_lru, list); dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); } /* * These can only be called under the global LRU lock, ie during the * callback for freeing the LRU list. "isolate" removes it from the * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } static void ___d_drop(struct dentry *dentry) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_root, which are always IS_ROOT: */ if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_roots; else b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); hlist_bl_unlock(b); } void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { ___d_drop(dentry); dentry->d_hash.pprev = NULL; write_seqcount_invalidate(&dentry->d_seq); } } EXPORT_SYMBOL(__d_drop); /** * d_drop - drop a dentry * @dentry: dentry to drop * * d_drop() unhashes the entry from the parent dentry hashes, so that it won't * be found through a VFS lookup any more. Note that this is different from * deleting the dentry - d_delete will try to mark the dentry negative if * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * * __d_drop requires dentry->d_lock * * ___d_drop doesn't mark dentry as "unhashed" * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_drop); static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; /* * Inform d_walk() and shrink_dentry_list() that we are no longer * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; if (unlikely(hlist_unhashed(&dentry->d_sib))) return; __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been * a normal list member, it didn't matter - ->d_sib.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - * ->d_lock on parent prevents that and since a cursor has no children * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found * using the value left in its ->d_sib.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * * Solution: make sure that the pointer left behind in ->d_sib.next * points to something that won't be moving around. I.e. skip the * cursors. */ while (dentry->d_sib.next) { next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; dentry->d_sib.next = next->d_sib.next; } } static struct dentry *__dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; /* * The dentry is now unrecoverably dead to the world. */ lockref_mark_dead(&dentry->d_lockref); /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); if (dentry->d_inode) dentry_unlink_inode(dentry); else spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); cond_resched(); /* now that it's negative, ->d_parent is stable */ if (!IS_ROOT(dentry)) { parent = dentry->d_parent; spin_lock(&parent->d_lock); } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry_unlist(dentry); if (dentry->d_flags & DCACHE_SHRINK_LIST) can_free = false; spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); if (parent && --parent->d_lockref.count) { spin_unlock(&parent->d_lock); return NULL; } return parent; } /* * Lock a dentry for feeding it to __dentry_kill(). * Called under rcu_read_lock() and dentry->d_lock; the former * guarantees that nothing we access will be freed under us. * Note that dentry is *not* protected from concurrent dentry_kill(), * d_delete(), etc. * * Return false if dentry is busy. Otherwise, return true and have * that dentry's inode locked. */ static bool lock_for_kill(struct dentry *dentry) { struct inode *inode = dentry->d_inode; if (unlikely(dentry->d_lockref.count)) return false; if (!inode || likely(spin_trylock(&inode->i_lock))) return true; do { spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); if (likely(inode == dentry->d_inode)) break; spin_unlock(&inode->i_lock); inode = dentry->d_inode; } while (inode); if (likely(!dentry->d_lockref.count)) return true; if (inode) spin_unlock(&inode->i_lock); return false; } /* * Decide if dentry is worth retaining. Usually this is called with dentry * locked; if not locked, we are more limited and might not be able to tell * without a lock. False in this case means "punt to locked path and recheck". * * In case we aren't locked, these predicates are not "stable". However, it is * sufficient that at some point after we dropped the reference the dentry was * hashed and the flags had the proper value. Other dentry users may have * re-gotten a reference to the dentry and change that, but our work is done - * we can leave the dentry around with a zero refcount. */ static inline bool retain_dentry(struct dentry *dentry, bool locked) { unsigned int d_flags; smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); // Unreachable? Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; // Same if it's disconnected if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; // ->d_delete() might tell us not to bother, but that requires // ->d_lock; can't decide without it if (unlikely(d_flags & DCACHE_OP_DELETE)) { if (!locked || dentry->d_op->d_delete(dentry)) return false; } // Explicitly told not to bother if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; // At this point it looks like we ought to keep it. We also might // need to do something - put it on LRU if it wasn't there already // and mark it referenced if it was on LRU, but not marked yet. // Unfortunately, both actions require ->d_lock, so in lockless // case we'd have to punt rather than doing those. if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { if (!locked) return false; d_lru_add(dentry); } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { if (!locked) return false; dentry->d_flags |= DCACHE_REFERENCED; } return true; } void d_mark_dontcache(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } inode->i_state |= I_DONTCACHE; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * In that case refcount is guaranteed to be zero and we have already * decided that it's not worth keeping around. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; /* * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } dentry->d_lockref.count--; goto locked; } /* * If we weren't the last ref, we're done. */ if (ret) return true; /* * Can we decide that decrement of refcount is all we needed without * taking the lock? There's a very common case when it's all we need - * dentry looks like it ought to be retained and there's nothing else * to do. */ if (retain_dentry(dentry, false)) return true; /* * Either not worth retaining or we can't tell without the lock. * Get the lock, then. We've already decremented the refcount to 0, * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ locked: if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } return false; } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. * * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. */ void dput(struct dentry *dentry) { if (!dentry) return; might_sleep(); rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } while (lock_for_kill(dentry)) { rcu_read_unlock(); dentry = __dentry_kill(dentry); if (!dentry) return; if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } rcu_read_lock(); } rcu_read_unlock(); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dput); static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); d_shrink_add(dentry, list); } } void dput_to_list(struct dentry *dentry, struct list_head *list) { rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } rcu_read_unlock(); to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } repeat: /* * Don't need rcu_dereference because we re-check it was correct under * the lock. */ rcu_read_lock(); ret = dentry->d_parent; spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); rcu_read_unlock(); goto repeat; } rcu_read_unlock(); BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); lockref_get(&alias->d_lockref); return alias; } /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for * * If any aliases exist for the given inode, take and return a * reference for one of them. If no aliases exist, return %NULL. */ struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); de = __d_find_any_alias(inode); spin_unlock(&inode->i_lock); return de; } EXPORT_SYMBOL(d_find_any_alias); static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias; if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); } return NULL; } /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. * Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root * of a filesystem, or if the directory was renamed and d_revalidate * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. */ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; } EXPORT_SYMBOL(d_find_alias); /* * Caller MUST be holding rcu_read_lock() and be guaranteed * that inode won't get freed until rcu_read_unlock(). */ struct dentry *d_find_alias_rcu(struct inode *inode) { struct hlist_head *l = &inode->i_dentry; struct dentry *de = NULL; spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no aliases left if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { hlist_for_each_entry(de, l, d_u.d_alias) if (!d_unhashed(de)) break; } } spin_unlock(&inode->i_lock); return de; } /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { LIST_HEAD(dispose); struct dentry *dentry; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) to_shrink_list(dentry, &dispose); spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim) { do { rcu_read_unlock(); victim = __dentry_kill(victim); rcu_read_lock(); } while (victim && lock_for_kill(victim)); rcu_read_unlock(); if (victim) spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); if (!lock_for_kill(dentry)) { bool can_free; rcu_read_unlock(); d_shrink_del(dentry); can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } d_shrink_del(dentry); shrink_kill(dentry); } } static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; /* * Referenced dentries are still in use. If they have active * counts, just remove them from the LRU. Otherwise give them * another pass through the LRU. */ if (dentry->d_lockref.count) { d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; spin_unlock(&dentry->d_lock); /* * The list move itself will be made by the common LRU code. At * this point, we've dropped the dentry->d_lock but keep the * lru lock. This is safe to do, since every list movement is * protected by the lru lock even if both locks are held. * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like * list_lru_add_obj and list_lru_del_obj. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * * The only exceptions to this are functions like * shrink_dentry_list, and code that first checks for the * DCACHE_SHRINK_LIST flag. Those are guaranteed to be * operating only with stack provided lists after they are * properly isolated from the main list. It is thus, always a * local access. */ return LRU_ROTATE; } d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * prune_dcache_sb - shrink the dcache * @sb: superblock * @sc: shrink control, passed to list_lru_shrink_walk() * * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * * Shrink the dcache for the specified super block. This is used to free * the dcache before unmounting a file system. */ void shrink_dcache_sb(struct super_block *sb) { do { LIST_HEAD(dispose); list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk * @D_WALK_QUIT: quit walk * @D_WALK_NORETRY: quit when retry is needed * @D_WALK_SKIP: skip this dentry and its children */ enum d_walk_ret { D_WALK_CONTINUE, D_WALK_QUIT, D_WALK_NORETRY, D_WALK_SKIP, }; /** * d_walk - walk the dentry tree * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry * * The @enter() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent, *dentry; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; again: read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; spin_lock(&this_parent->d_lock); ret = enter(data, this_parent); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: case D_WALK_SKIP: goto out_unlock; case D_WALK_NORETRY: retry = false; break; } repeat: dentry = d_first_child(this_parent); resume: hlist_for_each_entry_from(dentry, d_sib) { if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: spin_unlock(&dentry->d_lock); goto out_unlock; case D_WALK_NORETRY: retry = false; break; case D_WALK_SKIP: spin_unlock(&dentry->d_lock); continue; } if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ rcu_read_lock(); ascend: if (this_parent != parent) { dentry = this_parent; this_parent = dentry->d_parent; spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ hlist_for_each_entry_continue(dentry, d_sib) { if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { rcu_read_unlock(); goto resume; } } goto ascend; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); out_unlock: spin_unlock(&this_parent->d_lock); done_seqretry(&rename_lock, seq); return; rename_retry: spin_unlock(&this_parent->d_lock); rcu_read_unlock(); BUG_ON(seq & 1); if (!retry) return; seq = 1; goto again; } struct check_mount { struct vfsmount *mnt; unsigned int mounted; }; static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; struct path path = { .mnt = info->mnt, .dentry = dentry }; if (likely(!d_mountpoint(dentry))) return D_WALK_CONTINUE; if (__path_is_mountpoint(&path)) { info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * path_has_submounts - check for mounts over a dentry in the * current namespace. * @parent: path to check. * * Return true if the parent or its subdirectories contain * a mount point in the current namespace. */ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; read_seqlock_excl(&mount_lock); d_walk(parent->dentry, &data, path_check_mount); read_sequnlock_excl(&mount_lock); return data.mounted; } EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is * reachable (e.g. NFS can unhash a directory dentry and then the complete * subtree can become unreachable). * * Only one of d_invalidate() and d_set_mounted() must succeed. For * this reason take rename_lock and d_lock on dentry and ancestors. */ int d_set_mounted(struct dentry *dentry) { struct dentry *p; int ret = -ENOENT; write_seqlock(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { /* Need exclusion wrt. d_invalidate() */ spin_lock(&p->d_lock); if (unlikely(d_unhashed(p))) { spin_unlock(&p->d_lock); goto out; } spin_unlock(&p->d_lock); } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { ret = -EBUSY; if (!d_mountpoint(dentry)) { dentry->d_flags |= DCACHE_MOUNTED; ret = 0; } } spin_unlock(&dentry->d_lock); out: write_sequnlock(&rename_lock); return ret; } /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level * whenever the d_children list is non-empty and continue * searching. * * It returns zero iff there are no unused children, * otherwise it returns the number of children moved to * the end of the unused list. This may not be the total * number of unused children, because select_parent can * drop the lock and return early due to latency * constraints. */ struct select_data { struct dentry *start; union { long found; struct dentry *victim; }; struct list_head dispose; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; } else if (!dentry->d_lockref.count) { to_shrink_list(dentry, &data->dispose); data->found++; } else if (dentry->d_lockref.count < 0) { data->found++; } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (!dentry->d_lockref.count) { if (dentry->d_flags & DCACHE_SHRINK_LIST) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } to_shrink_list(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } /** * shrink_dcache_parent - prune dcache * @parent: parent of entries to prune * * Prune the dcache to remove unused children of the parent dentry. */ void shrink_dcache_parent(struct dentry *parent) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); d_walk(parent, &data, select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); continue; } cond_resched(); if (!data.found) break; data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { spin_lock(&data.victim->d_lock); if (!lock_for_kill(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { shrink_kill(data.victim); } } if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); } } EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ if (!hlist_empty(&dentry->d_children)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); return D_WALK_CONTINUE; } static void do_one_tree(struct dentry *dentry) { shrink_dcache_parent(dentry); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } /* * destroy the dentries attached to a superblock on unmounting */ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; rwsem_assert_held_write(&sb->s_umount); dentry = sb->s_root; sb->s_root = NULL; do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_roots)) { dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); do_one_tree(dentry); } } static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { *victim = dget_dlock(dentry); return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) */ void d_invalidate(struct dentry *dentry) { bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ if (!dentry->d_inode) return; shrink_dcache_parent(dentry); for (;;) { struct dentry *victim = NULL; d_walk(dentry, &victim, find_submount); if (!victim) { if (had_submounts) shrink_dcache_parent(dentry); return; } had_submounts = true; detach_mounts(victim); dput(victim); } } EXPORT_SYMBOL(d_invalidate); /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; int err; dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru, GFP_KERNEL); if (!dentry) return NULL; /* * We guarantee that the inline name is always NUL-terminated. * This way the memcpy() done by the name switching in rename * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; dname = dentry->d_shortname.string; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } atomic_set(&p->count, 1); dname = p->name; } else { dname = dentry->d_shortname.string; } dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; lockref_init(&dentry->d_lockref, 1); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; dentry->d_op = NULL; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_HLIST_NODE(&dentry->d_sib); d_set_d_op(dentry, dentry->d_sb->s_d_op); if (dentry->d_op && dentry->d_op->d_init) { err = dentry->d_op->d_init(dentry); if (err) { if (dname_external(dentry)) kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); return NULL; } } this_cpu_inc(nr_dentry); return dentry; } /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject * to concurrency here */ dentry->d_parent = dget_dlock(parent); hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; } EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_anon(struct super_block *sb) { return __d_alloc(sb, NULL); } EXPORT_SYMBOL(d_alloc_anon); struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; } /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock * @name: qstr of the name * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. * This is used for pipes, sockets et.al. - the stuff that should * never be anyone's children or parents. Unlike all other * dentries, these will not have RCU delay between dropping the * last reference and freeing them. * * The only user is alloc_file_pseudo() and that's what should * be considered a public interface. Don't use directly. */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; struct dentry *dentry = __d_alloc(sb, name); if (likely(dentry)) { dentry->d_flags |= DCACHE_NORCU; if (!sb->s_d_op) d_set_d_op(dentry, &anon_ops); } return dentry; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; q.name = name; q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { WARN_ON_ONCE(dentry->d_op); WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_REAL)); dentry->d_op = op; if (!op) return; if (op->d_hash) dentry->d_flags |= DCACHE_OP_HASH; if (op->d_compare) dentry->d_flags |= DCACHE_OP_COMPARE; if (op->d_revalidate) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_weak_revalidate) dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; if (op->d_real) dentry->d_flags |= DCACHE_OP_REAL; } EXPORT_SYMBOL(d_set_d_op); static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; if (S_ISDIR(inode->i_mode)) { add_flags = DCACHE_DIRECTORY_TYPE; if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { if (unlikely(!inode->i_op->lookup)) add_flags = DCACHE_AUTODIR_TYPE; else inode->i_opflags |= IOP_LOOKUP; } goto type_determined; } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } inode->i_opflags |= IOP_NOFOLLOW; } if (unlikely(!S_ISREG(inode->i_mode))) add_flags = DCACHE_SPECIAL_TYPE; type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; } static void __d_instantiate(struct dentry *dentry, struct inode *inode) { unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); spin_lock(&dentry->d_lock); /* * The negative counter only tracks dentries on the LRU. Don't dec if * d_lru is on another list. */ if ((dentry->d_flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); spin_unlock(&dentry->d_lock); } /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete * @inode: inode to attach to this dentry * * Fill in inode information in the entry. * * This turns negative dentries into productive full members * of society. * * NOTE! This assumes that the inode count has been incremented * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. */ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_instantiate); /* * This should be equivalent to d_instantiate() + unlock_new_inode(), * with lockdep-related part of unlock_new_inode() done before * anything else. Use that instead of open-coding d_instantiate()/ * unlock_new_inode() combinations. */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { res = d_alloc_anon(root_inode->i_sb); if (res) d_instantiate(res, root_inode); else iput(root_inode); } return res; } EXPORT_SYMBOL(d_make_root); static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { struct super_block *sb; struct dentry *new, *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); sb = inode->i_sb; res = d_find_any_alias(inode); /* existing alias? */ if (res) goto out; new = d_alloc_anon(sb); if (!new) { res = ERR_PTR(-ENOMEM); goto out; } security_d_instantiate(new, inode); spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); /* recheck under lock */ if (likely(!res)) { /* still no alias, attach a disconnected dentry */ unsigned add_flags = d_flags_for_inode(inode); if (disconnected) add_flags |= DCACHE_DISCONNECTED; spin_lock(&new->d_lock); __d_set_inode_and_type(new, inode, add_flags); hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); if (!disconnected) { hlist_bl_lock(&sb->s_roots); hlist_bl_add_head(&new->d_hash, &sb->s_roots); hlist_bl_unlock(&sb->s_roots); } spin_unlock(&new->d_lock); spin_unlock(&inode->i_lock); inode = NULL; /* consumed by new->d_inode */ res = new; } else { spin_unlock(&inode->i_lock); dput(new); } out: iput(inode); return res; } /** * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain a dentry for an inode resulting from NFS filehandle conversion or * similar open by handle operations. The returned dentry may be anonymous, * or may have a full name (if the inode was already in the cache). * * When called on a directory inode, we must ensure that the inode only ever * has one dentry. If a dentry is found, that is returned instead of * allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is released. * To make it easier to use in export operations a %NULL or IS_ERR inode may * be passed in and the error will be propagated to the return value, * with a %NULL @inode replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_alias(struct inode *inode) { return __d_obtain_alias(inode, true); } EXPORT_SYMBOL(d_obtain_alias); /** * d_obtain_root - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain an IS_ROOT dentry for the root of a filesystem. * * We must ensure that directory inodes only ever have one dentry. If a * dentry is found, that is returned instead of allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is * released. A %NULL or IS_ERR inode may be passed in and will be the * error will be propagate to the return value, with a %NULL @inode * replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_root(struct inode *inode) { return __d_obtain_alias(inode, false); } EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name * @dentry: the negative dentry that was passed to the parent's lookup func * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * * For a case-insensitive lookup match and if the case-exact dentry * already exists in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { struct dentry *found, *res; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); if (found) { iput(inode); return found; } if (d_in_lookup(dentry)) { found = d_alloc_parallel(dentry->d_parent, name, dentry->d_wait); if (IS_ERR(found) || !d_in_lookup(found)) { iput(inode); return found; } } else { found = d_alloc(dentry->d_parent, name); if (!found) { iput(inode); return ERR_PTR(-ENOMEM); } } res = d_splice_alias(inode, found); if (res) { d_lookup_done(found); dput(found); return res; } return found; } EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name * @dentry: the negative dentry that was passed to the parent's lookup func * @parent: parent dentry * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false */ bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name) { if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { if (dentry->d_name.len != name->len) return false; return dentry_cmp(dentry, name->name, name->len) == 0; } return parent->d_op->d_compare(dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } EXPORT_SYMBOL_GPL(d_same_name); /* * This is __d_lookup_rcu() when the parent dentry has * DCACHE_OP_COMPARE, which makes things much nastier. */ static noinline struct dentry *__d_lookup_rcu_op_compare( const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { int tlen; const char *tname; unsigned seq; seqretry: seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; tlen = dentry->d_name.len; tname = dentry->d_name.name; /* we want a consistent (name,len) pair */ if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); goto seqretry; } if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) continue; *seqp = seq; return dentry; } return NULL; } /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name * resolution (store-free path walking) design described in * Documentation/filesystems/path-lookup.txt. * * This is not to be used outside core vfs. * * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock * held, and rcu_read_lock held. The returned dentry must not be stored into * without taking d_lock and checking d_seq sequence count against @seq * returned here. * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. * * NOTE! The caller *has* to check the resulting dentry against the sequence * number we've returned before using any of the resulting dentry state! */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) return __d_lookup_rcu_op_compare(parent, name, seqp); /* * The hash list is protected using RCU. * * Carefully use d_seq when comparing a candidate dentry, to avoid * races with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. * * The caller must perform a seqcount check in order * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it * is in the middle of a sequence change. If we do the slow * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. * * Note that raw_seqcount_begin still *does* smp_rmb(), so * we are still guaranteed NUL-termination of ->d_name.name. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (dentry->d_name.hash_len != hashlen) continue; if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) continue; *seqp = seq; return dentry; } return NULL; } /** * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * d_lookup searches the children of the parent dentry for the name in * question. If the dentry is found its reference count is incremented and the * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; do { seq = read_seqbegin(&rename_lock); dentry = __d_lookup(parent, name); if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; } EXPORT_SYMBOL(d_lookup); /** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * __d_lookup is like d_lookup, however it may (rarely) return a * false-negative result due to unrelated rename activity. * * __d_lookup is slightly faster by avoiding rename_lock read seqlock, * however it must be used carefully, eg. with a following d_lookup in * the case of failure. * * __d_lookup callers must be commented. */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { unsigned int hash = name->hash; struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races * with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ rcu_read_lock(); hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { if (dentry->d_name.hash != hash) continue; spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) goto next; if (d_unhashed(dentry)) goto next; if (!d_same_name(dentry, parent, name)) goto next; dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; next: spin_unlock(&dentry->d_lock); } rcu_read_unlock(); return found; } /** * d_hash_and_lookup - hash the qstr then search for a dentry * @dir: Directory to search in * @name: qstr of name we wish to find * * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) */ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) { /* * Check for a fs-specific hash function. Note that we must * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } return d_lookup(dir, name); } EXPORT_SYMBOL(d_hash_and_lookup); /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry * - unhash this dentry and free it. * * Usually, we want to just turn this into * a negative dentry, but if anybody else is * currently using the dentry or the inode * we can't do that and we fall back on removing * it from the hash queues and waiting for * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * * Turn the dentry into a negative dentry if possible, otherwise * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) { struct inode *inode = dentry->d_inode; spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); /* * Are we the only user? */ if (dentry->d_lockref.count == 1) { if (dentry_negative_policy) __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry *entry) { struct hlist_bl_head *b = d_hash(entry->d_name.hash); hlist_bl_lock(b); hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash * * Adds a dentry to the hash according to its name. */ void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); __d_rehash(entry); spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) return n; cpu_relax(); } } static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) { if (d_in_lookup(dentry)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(dentry->d_wait, &wait); do { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&dentry->d_lock); schedule(); spin_lock(&dentry->d_lock); } while (d_in_lookup(dentry)); } } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, wait_queue_head_t *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; struct dentry *new = d_alloc(parent, name); struct dentry *dentry; unsigned seq, r_seq, d_seq; if (unlikely(!new)) return ERR_PTR(-ENOMEM); retry: rcu_read_lock(); seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } if (read_seqcount_retry(&dentry->d_seq, d_seq)) { rcu_read_unlock(); dput(dentry); goto retry; } rcu_read_unlock(); dput(new); return dentry; } if (unlikely(read_seqretry(&rename_lock, r_seq))) { rcu_read_unlock(); goto retry; } if (unlikely(seq & 1)) { rcu_read_unlock(); goto retry; } hlist_bl_lock(b); if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; } /* * No changes for the parent since the beginning of d_lookup(). * Since all removals from the chain happen with hlist_bl_lock(), * any potential in-lookup matches are going to stay here until * we unlock the chain. All fields are stable in everything * we encounter. */ hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) continue; if (!d_same_name(dentry, parent, name)) continue; hlist_bl_unlock(b); /* now we can try to grab a reference */ if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } rcu_read_unlock(); /* * somebody is likely to be still doing lookup for it; * wait for them to finish */ spin_lock(&dentry->d_lock); d_wait_lookup(dentry); /* * it's not in-lookup anymore; in principle we should repeat * everything from dcache lookup, but it's likely to be what * d_lookup() would've found anyway. If it is, just return it; * otherwise we really have to repeat the whole thing. */ if (unlikely(dentry->d_name.hash != hash)) goto mismatch; if (unlikely(dentry->d_parent != parent)) goto mismatch; if (unlikely(d_unhashed(dentry))) goto mismatch; if (unlikely(!d_same_name(dentry, parent, name))) goto mismatch; /* OK, it *is* a hashed match; return it */ spin_unlock(&dentry->d_lock); dput(new); return dentry; } rcu_read_unlock(); /* we can't take ->d_lock here; it's OK, though. */ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: spin_unlock(&dentry->d_lock); dput(dentry); goto retry; } EXPORT_SYMBOL(d_alloc_parallel); /* * - Unhash the dentry * - Retrieve and clear the waitqueue head in dentry * - Return the waitqueue head */ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) { wait_queue_head_t *d_wait; struct hlist_bl_head *b; lockdep_assert_held(&dentry->d_lock); b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } void __d_lookup_unhash_wake(struct dentry *dentry) { spin_lock(&dentry->d_lock); wake_up_all(__d_lookup_unhash(dentry)); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(__d_lookup_unhash_wake); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode) { wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(dentry); } if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } __d_rehash(dentry); if (dir) end_dir_add(dir, n, d_wait); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); } /** * d_add - add dentry to hash queues * @entry: dentry to add * @inode: The inode to attach to this dentry * * This adds the entry to the hash queues and initializes @inode. * The entry was actually filled in earlier during d_alloc(). */ void d_add(struct dentry *entry, struct inode *inode) { if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); } __d_add(entry, inode); } EXPORT_SYMBOL(d_add); /** * d_exact_alias - find and hash an exact unhashed alias * @entry: dentry to add * @inode: The inode to go with this dentry * * If an unhashed dentry with the same name/parent and desired * inode already exists, hash and return it. Otherwise, return * NULL. * * Parent directory should be locked. */ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) { struct dentry *alias; unsigned int hash = entry->d_name.hash; spin_lock(&inode->i_lock); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or * parent changes, because the parent inode i_mutex is held. */ if (alias->d_name.hash != hash) continue; if (alias->d_parent != entry->d_parent) continue; if (!d_same_name(alias, entry->d_parent, &entry->d_name)) continue; spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { spin_unlock(&alias->d_lock); alias = NULL; } else { dget_dlock(alias); __d_rehash(alias); spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); return alias; } spin_unlock(&inode->i_lock); return NULL; } EXPORT_SYMBOL(d_exact_alias); static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { if (unlikely(dname_external(dentry))) { /* * Both external: swap the pointers */ swap(target->d_name.name, dentry->d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ dentry->d_name.name = target->d_name.name; target->d_shortname = dentry->d_shortname; target->d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { /* * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ target->d_name.name = dentry->d_name.name; dentry->d_shortname = target->d_shortname; dentry->d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. */ for (int i = 0; i < DNAME_INLINE_WORDS; i++) swap(dentry->d_shortname.words[i], target->d_shortname.words[i]); } } swap(dentry->d_name.hash_len, target->d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) { struct external_name *old_name = NULL; if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->count); dentry->d_name = target->d_name; } else { dentry->d_shortname = target->d_shortname; dentry->d_name.name = dentry->d_shortname.string; dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->count))) kfree_rcu(old_name, head); } /* * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. Caller must hold * rename_lock, the i_mutex of the source and target directories, * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; WARN_ON(!dentry->d_inode); if (WARN_ON(dentry == target)) return; BUG_ON(d_ancestor(target, dentry)); old_parent = dentry->d_parent; p = d_ancestor(old_parent, target); if (IS_ROOT(dentry)) { BUG_ON(p); spin_lock(&target->d_parent->d_lock); } else if (!p) { /* target is not a descendent of dentry->d_parent */ spin_lock(&target->d_parent->d_lock); spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); } else { BUG_ON(p == dentry); spin_lock(&old_parent->d_lock); if (p != target) spin_lock_nested(&target->d_parent->d_lock, DENTRY_D_LOCK_NESTED); } spin_lock_nested(&dentry->d_lock, 2); spin_lock_nested(&target->d_lock, 3); if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(target); } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* unhash both */ if (!d_unhashed(dentry)) ___d_drop(dentry); if (!d_unhashed(target)) ___d_drop(target); /* ... and switch them in the tree */ dentry->d_parent = target->d_parent; if (!exchange) { copy_name(dentry, target); target->d_hash.pprev = NULL; dentry->d_parent->d_lockref.count++; if (dentry != old_parent) /* wasn't IS_ROOT */ WARN_ON(!--old_parent->d_lockref.count); } else { target->d_parent = old_parent; swap_names(dentry, target); if (!hlist_unhashed(&target->d_sib)) __hlist_del(&target->d_sib); hlist_add_head(&target->d_sib, &target->d_parent->d_children); __d_rehash(target); fsnotify_update_flags(target); } if (!hlist_unhashed(&dentry->d_sib)) __hlist_del(&dentry->d_sib); hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); if (dir) end_dir_add(dir, n, d_wait); if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); if (dentry != old_parent) spin_unlock(&old_parent->d_lock); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); } /* * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. See the locking * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); /* * d_exchange - exchange two dentries * @dentry1: first dentry * @dentry2: second dentry */ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) { write_seqlock(&rename_lock); WARN_ON(!dentry1->d_inode); WARN_ON(!dentry2->d_inode); WARN_ON(IS_ROOT(dentry1)); WARN_ON(IS_ROOT(dentry2)); __d_move(dentry1, dentry2, true); write_sequnlock(&rename_lock); } /** * d_ancestor - search for an ancestor * @p1: ancestor dentry * @p2: child dentry * * Returns the ancestor dentry of p2 which is a child of p1, if p1 is * an ancestor of p2, else NULL. */ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) { struct dentry *p; for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1) return p; } return NULL; } /* * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ static int __d_unalias(struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) goto out_unalias; /* See lock_rename() */ if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: if (alias->d_op && alias->d_op->d_unalias_trylock && !alias->d_op->d_unalias_trylock(alias)) goto out_err; __d_move(alias, dentry, false); if (alias->d_op && alias->d_op->d_unalias_unlock) alias->d_op->d_unalias_unlock(alias); ret = 0; out_err: if (m2) up_read(m2); if (m1) mutex_unlock(m1); return ret; } /** * d_splice_alias - splice a disconnected dentry into the tree if one exists * @inode: the inode which may have a disconnected dentry * @dentry: a negative dentry which we want to point to the inode. * * If inode is a directory and has an IS_ROOT alias, then d_move that in * place of the given dentry and return it, else simply d_add the inode * to the dentry and return NULL. * * If a non-IS_ROOT directory is found, the filesystem is corrupt, and * we should error out: directories can't have multiple aliases. * * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * * If a dentry was found and moved, then it is returned. Otherwise NULL * is returned. This matches the expected return value of ->lookup. * * Cluster filesystems may call this function with a negative, hashed dentry. * In that case, we know that the inode will be a regular file, and also this * will only occur during atomic_open. So we need to check for the dentry * being already hashed only in the final case. */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { if (IS_ERR(inode)) return ERR_CAST(inode); BUG_ON(!d_unhashed(dentry)); if (!inode) goto out; security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { /* The reference to new ensures it remains an alias */ spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( "VFS: Lookup of '%s' in %s %s" " would have caused loop\n", dentry->d_name.name, inode->i_sb->s_type->name, inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); int err = __d_unalias(dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); new = ERR_PTR(err); } dput(old_parent); } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); } iput(inode); return new; } } out: __d_add(dentry, inode); return NULL; } EXPORT_SYMBOL(d_splice_alias); /* * Test whether new_dentry is a subdirectory of old_dentry. * * Trivially implemented using the dcache structure */ /** * is_subdir - is new dentry a subdirectory of old_dentry * @new_dentry: new dentry * @old_dentry: old dentry * * Returns true if new_dentry is a subdirectory of the parent (at any depth). * Returns false otherwise. * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { bool subdir; unsigned seq; if (new_dentry == old_dentry) return true; /* Access d_parent under rcu as d_move() may change it. */ rcu_read_lock(); seq = read_seqbegin(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); /* Try lockless once... */ if (read_seqretry(&rename_lock, seq)) { /* ...else acquire lock for progress even on deep chains. */ read_seqlock_excl(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); read_sequnlock_excl(&rename_lock); } rcu_read_unlock(); return subdir; } EXPORT_SYMBOL(is_subdir); static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { struct dentry *root = data; if (dentry != root) { if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; dentry->d_lockref.count--; } } return D_WALK_CONTINUE; } void d_genocide(struct dentry *parent) { d_walk(parent, parent, d_genocide_kill); } void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; BUG_ON(dname_external(dentry) || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); } EXPORT_SYMBOL(d_mark_tmpfile); void d_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; inode_dec_link_count(inode); d_mark_tmpfile(file, inode); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); /* * Obtain inode number of the parent dentry. */ ino_t d_parent_ino(struct dentry *dentry) { struct dentry *parent; struct inode *iparent; unsigned seq; ino_t ret; scoped_guard(rcu) { seq = raw_seqcount_begin(&dentry->d_seq); parent = READ_ONCE(dentry->d_parent); iparent = d_inode_rcu(parent); if (likely(iparent)) { ret = iparent->i_ino; if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; } } spin_lock(&dentry->d_lock); ret = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(d_parent_ino); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { if (!str) return 0; dhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_EARLY | HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } static void __init dcache_init(void) { /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature * of the dcache. */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_shortname.string); /* Hash may have been set up in dcache_init_early */ if (!hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __ro_after_init; EXPORT_SYMBOL(names_cachep); void __init vfs_caches_init_early(void) { int i; for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); dcache_init_early(); inode_init_early(); } void __init vfs_caches_init(void) { names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL); dcache_init(); inode_init(); files_init(); files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); }
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 /* * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <net/inet_connection_sock.h> #include <net/tls.h> #include <net/tls_toe.h> #include "tls.h" static LIST_HEAD(device_list); static DEFINE_SPINLOCK(device_spinlock); static void tls_toe_sk_destruct(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx = tls_get_ctx(sk); ctx->sk_destruct(sk); /* Free ctx */ rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tls_ctx_free(sk, ctx); } int tls_toe_bypass(struct sock *sk) { struct tls_toe_device *dev; struct tls_context *ctx; int rc = 0; spin_lock_bh(&device_spinlock); list_for_each_entry(dev, &device_list, dev_list) { if (dev->feature && dev->feature(dev)) { ctx = tls_ctx_create(sk); if (!ctx) goto out; ctx->sk_destruct = sk->sk_destruct; sk->sk_destruct = tls_toe_sk_destruct; ctx->rx_conf = TLS_HW_RECORD; ctx->tx_conf = TLS_HW_RECORD; update_sk_prot(sk, ctx); rc = 1; break; } } out: spin_unlock_bh(&device_spinlock); return rc; } void tls_toe_unhash(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); struct tls_toe_device *dev; spin_lock_bh(&device_spinlock); list_for_each_entry(dev, &device_list, dev_list) { if (dev->unhash) { kref_get(&dev->kref); spin_unlock_bh(&device_spinlock); dev->unhash(dev, sk); kref_put(&dev->kref, dev->release); spin_lock_bh(&device_spinlock); } } spin_unlock_bh(&device_spinlock); ctx->sk_proto->unhash(sk); } int tls_toe_hash(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); struct tls_toe_device *dev; int err; err = ctx->sk_proto->hash(sk); spin_lock_bh(&device_spinlock); list_for_each_entry(dev, &device_list, dev_list) { if (dev->hash) { kref_get(&dev->kref); spin_unlock_bh(&device_spinlock); err |= dev->hash(dev, sk); kref_put(&dev->kref, dev->release); spin_lock_bh(&device_spinlock); } } spin_unlock_bh(&device_spinlock); if (err) tls_toe_unhash(sk); return err; } void tls_toe_register_device(struct tls_toe_device *device) { spin_lock_bh(&device_spinlock); list_add_tail(&device->dev_list, &device_list); spin_unlock_bh(&device_spinlock); } EXPORT_SYMBOL(tls_toe_register_device); void tls_toe_unregister_device(struct tls_toe_device *device) { spin_lock_bh(&device_spinlock); list_del(&device->dev_list); spin_unlock_bh(&device_spinlock); } EXPORT_SYMBOL(tls_toe_unregister_device);
3140 3153 1 1 3 3 3 7 8101 7872 7884 7877 7890 24 7879 8090 8101 7861 7920 7880 8868 8871 3594 8095 8863 8577 2 10 12 355 127 1 360 8536 8529 8537 8907 8905 125 124 7 7 3 3 57 2108 6419 8587 8636 7593 8574 8567 1 8579 8583 20 20 20 20 17 7541 7549 7593 3 7554 99 94 94 94 5 30 71 85 83 81 1 1 77 1 1 77 79 1 78 100 1 99 27 74 10 90 1 2 31 71 4 94 96 1 1 97 98 98 7539 7532 7537 514 1 7535 70 7047 2277 7110 1242 1242 4859 4858 46 16 46 4 4 12 21 4 19 2 1 1 1 1 17 1 1 4 12 12 3 12 14 3 5 9 4 9 2 6 8 11 2 12 9 10 4 8 1 1 1 1 4 1 1 3 4 1 3 3 102 1 8582 2 8569 8448 135 6596 8 7 6 7 1 1 1 1 6628 641 6602 6598 9066 9069 9066 9020 1256 324 8619 8577 8546 8580 8607 8583 8619 5 8 6596 1213 1212 1214 1209 1215 213 10 209 1 1458 211 1193 1 210 1 111 147 147 213 212 1 212 3 212 10 211 2012 2012 1812 2020 7 1953 207 7 659 7 7 7 21 1 19 11 9 2 6 2 4 4 2 1 1 1 7 15 1 1 1 2 2 1 2 1 1 8584 1 124 8624 6 1 1 189 8429 7523 1272 1 8577 8541 47 8544 362 1 50 359 2 313 181 133 357 1 358 1 279 82 145 280 3 360 361 8 8 8 8 2829 6805 6799 1 5 1 2 3 6 1 5 6 1 2 3 2 3 4 1 5 642 643 637 3 1009 770 282 3 1 813 270 815 1006 1005 1005 1011 1 962 37 993 1 487 640 1 639 340 301 828 822 1 403 370 760 52 52 6213 605 5803 602 5836 5816 6214 3 7540 7513 139 91 41 7502 1625 5654 1589 1287 428 1299 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 // SPDX-License-Identifier: GPL-2.0-or-later /* * NETLINK Kernel-user communication protocol. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Patrick McHardy <kaber@trash.net> * * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith * added netlink_proto_exit * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br> * use nlk_sk, as sk->protinfo is on a diet 8) * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> * - inc module use count of module that owns * the kernel socket in case userspace opens * socket of same protocol * - remove all module support, since netlink is * mandatory if CONFIG_NET=y these days */ #include <linux/module.h> #include <linux/bpf.h> #include <linux/capability.h> #include <linux/kernel.h> #include <linux/filter.h> #include <linux/init.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/security.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/audit.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> #include <linux/rhashtable.h> #include <asm/cacheflush.h> #include <linux/hash.h> #include <linux/net_namespace.h> #include <linux/nospec.h> #include <linux/btf_ids.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> #define CREATE_TRACE_POINTS #include <trace/events/netlink.h> #include "af_netlink.h" #include "genetlink.h" struct listeners { struct rcu_head rcu; unsigned long masks[]; }; /* state bits */ #define NETLINK_S_CONGESTED 0x0 static inline int netlink_is_kernel(struct sock *sk) { return nlk_test_bit(KERNEL_SOCKET, sk); } struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS]; static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { "nlk_cb_mutex-ROUTE", "nlk_cb_mutex-1", "nlk_cb_mutex-USERSOCK", "nlk_cb_mutex-FIREWALL", "nlk_cb_mutex-SOCK_DIAG", "nlk_cb_mutex-NFLOG", "nlk_cb_mutex-XFRM", "nlk_cb_mutex-SELINUX", "nlk_cb_mutex-ISCSI", "nlk_cb_mutex-AUDIT", "nlk_cb_mutex-FIB_LOOKUP", "nlk_cb_mutex-CONNECTOR", "nlk_cb_mutex-NETFILTER", "nlk_cb_mutex-IP6_FW", "nlk_cb_mutex-DNRTMSG", "nlk_cb_mutex-KOBJECT_UEVENT", "nlk_cb_mutex-GENERIC", "nlk_cb_mutex-17", "nlk_cb_mutex-SCSITRANSPORT", "nlk_cb_mutex-ECRYPTFS", "nlk_cb_mutex-RDMA", "nlk_cb_mutex-CRYPTO", "nlk_cb_mutex-SMC", "nlk_cb_mutex-23", "nlk_cb_mutex-24", "nlk_cb_mutex-25", "nlk_cb_mutex-26", "nlk_cb_mutex-27", "nlk_cb_mutex-28", "nlk_cb_mutex-29", "nlk_cb_mutex-30", "nlk_cb_mutex-31", "nlk_cb_mutex-MAX_LINKS" }; static int netlink_dump(struct sock *sk, bool lock_taken); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion * and removal are protected with per bucket lock while using RCU list * modification primitives and may run in parallel to RCU protected lookups. * Destruction of the Netlink socket may only occur *after* nl_table_lock has * been acquired * either during or after the socket has been removed from * the list and after an RCU grace period. */ DEFINE_RWLOCK(nl_table_lock); EXPORT_SYMBOL_GPL(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); static BLOCKING_NOTIFIER_HEAD(netlink_chain); static const struct rhashtable_params netlink_rhashtable_params; void do_trace_netlink_extack(const char *msg) { trace_netlink_extack(msg); } EXPORT_SYMBOL(do_trace_netlink_extack); static inline u32 netlink_group_mask(u32 group) { if (group > 32) return 0; return group ? 1 << (group - 1) : 0; } static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, gfp_t gfp_mask) { unsigned int len = skb->len; struct sk_buff *new; new = alloc_skb(len, gfp_mask); if (new == NULL) return NULL; NETLINK_CB(new).portid = NETLINK_CB(skb).portid; NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; NETLINK_CB(new).creds = NETLINK_CB(skb).creds; skb_put_data(new, skb->data, len); return new; } static unsigned int netlink_tap_net_id; struct netlink_tap_net { struct list_head netlink_tap_all; struct mutex netlink_tap_lock; }; int netlink_add_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); if (unlikely(nt->dev->type != ARPHRD_NETLINK)) return -EINVAL; mutex_lock(&nn->netlink_tap_lock); list_add_rcu(&nt->list, &nn->netlink_tap_all); mutex_unlock(&nn->netlink_tap_lock); __module_get(nt->module); return 0; } EXPORT_SYMBOL_GPL(netlink_add_tap); static int __netlink_remove_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); bool found = false; struct netlink_tap *tmp; mutex_lock(&nn->netlink_tap_lock); list_for_each_entry(tmp, &nn->netlink_tap_all, list) { if (nt == tmp) { list_del_rcu(&nt->list); found = true; goto out; } } pr_warn("__netlink_remove_tap: %p not found\n", nt); out: mutex_unlock(&nn->netlink_tap_lock); if (found) module_put(nt->module); return found ? 0 : -ENODEV; } int netlink_remove_tap(struct netlink_tap *nt) { int ret; ret = __netlink_remove_tap(nt); synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(netlink_remove_tap); static __net_init int netlink_tap_init_net(struct net *net) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); INIT_LIST_HEAD(&nn->netlink_tap_all); mutex_init(&nn->netlink_tap_lock); return 0; } static struct pernet_operations netlink_tap_net_ops = { .init = netlink_tap_init_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), }; static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; /* We take the more conservative approach and * whitelist socket protocols that may pass. */ switch (sk->sk_protocol) { case NETLINK_ROUTE: case NETLINK_USERSOCK: case NETLINK_SOCK_DIAG: case NETLINK_NFLOG: case NETLINK_XFRM: case NETLINK_FIB_LOOKUP: case NETLINK_NETFILTER: case NETLINK_GENERIC: return true; } return false; } static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; struct sock *sk = skb->sk; int ret = -ENOMEM; if (!net_eq(dev_net(dev), sock_net(sk))) return 0; dev_hold(dev); if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); nskb->pkt_type = netlink_is_kernel(sk) ? PACKET_KERNEL : PACKET_USER; skb_reset_network_header(nskb); ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); } dev_put(dev); return ret; } static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn) { int ret; struct netlink_tap *tmp; if (!netlink_filter_tap(skb)) return; list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) break; } } static void netlink_deliver_tap(struct net *net, struct sk_buff *skb) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); rcu_read_lock(); if (unlikely(!list_empty(&nn->netlink_tap_all))) __netlink_deliver_tap(skb, nn); rcu_read_unlock(); } static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, struct sk_buff *skb) { if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) netlink_deliver_tap(sock_net(dst), skb); } static void netlink_overrun(struct sock *sk) { if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { WRITE_ONCE(sk->sk_err, ENOBUFS); sk_error_report(sk); } } atomic_inc(&sk->sk_drops); } static void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty_lockless(&sk->sk_receive_queue)) clear_bit(NETLINK_S_CONGESTED, &nlk->state); if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } static void netlink_skb_destructor(struct sk_buff *skb) { if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) vfree_atomic(skb->head); skb->head = NULL; } if (skb->sk != NULL) sock_rfree(skb); } static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { WARN_ON(skb->sk != NULL); skb->sk = sk; skb->destructor = netlink_skb_destructor; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static void netlink_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(nlk_sk(sk)->groups); } /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves * this, _but_ remember, it adds useless work on UP machines. */ void netlink_table_grab(void) __acquires(nl_table_lock) { might_sleep(); write_lock_irq(&nl_table_lock); if (atomic_read(&nl_table_users)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&nl_table_wait, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&nl_table_users) == 0) break; write_unlock_irq(&nl_table_lock); schedule(); write_lock_irq(&nl_table_lock); } __set_current_state(TASK_RUNNING); remove_wait_queue(&nl_table_wait, &wait); } } void netlink_table_ungrab(void) __releases(nl_table_lock) { write_unlock_irq(&nl_table_lock); wake_up(&nl_table_wait); } static inline void netlink_lock_table(void) { unsigned long flags; /* read_lock() synchronizes us to netlink_table_grab */ read_lock_irqsave(&nl_table_lock, flags); atomic_inc(&nl_table_users); read_unlock_irqrestore(&nl_table_lock, flags); } static inline void netlink_unlock_table(void) { if (atomic_dec_and_test(&nl_table_users)) wake_up(&nl_table_wait); } struct netlink_compare_arg { possible_net_t pnet; u32 portid; }; /* Doing sizeof directly may yield 4 extra bytes on 64-bit. */ #define netlink_compare_arg_len \ (offsetof(struct netlink_compare_arg, portid) + sizeof(u32)) static inline int netlink_compare(struct rhashtable_compare_arg *arg, const void *ptr) { const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; return nlk->portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } static void netlink_compare_arg_init(struct netlink_compare_arg *arg, struct net *net, u32 portid) { memset(arg, 0, sizeof(*arg)); write_pnet(&arg->pnet, net); arg->portid = portid; } static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, struct net *net) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, net, portid); return rhashtable_lookup_fast(&table->hash, &arg, netlink_rhashtable_params); } static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); } static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { struct netlink_table *table = &nl_table[protocol]; struct sock *sk; rcu_read_lock(); sk = __netlink_lookup(table, portid, net); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } static const struct proto_ops netlink_ops; static void netlink_update_listeners(struct sock *sk) { struct netlink_table *tbl = &nl_table[sk->sk_protocol]; unsigned long mask; unsigned int i; struct listeners *listeners; listeners = nl_deref_protected(tbl->listeners); if (!listeners) return; for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; sk_for_each_bound(sk, &tbl->mc_list) { if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ } static int netlink_insert(struct sock *sk, u32 portid) { struct netlink_table *table = &nl_table[sk->sk_protocol]; int err; lock_sock(sk); err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; if (nlk_sk(sk)->bound) goto err; /* portid can be read locklessly from netlink_getname(). */ WRITE_ONCE(nlk_sk(sk)->portid, portid); sock_hold(sk); err = __netlink_insert(table, sk); if (err) { /* In case the hashtable backend returns with -EBUSY * from here, it must not escape to the caller. */ if (unlikely(err == -EBUSY)) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; sock_put(sk); goto err; } /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); /* Paired with lockless reads from netlink_bind(), * netlink_connect() and netlink_sendmsg(). */ WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); return err; } static void netlink_remove(struct sock *sk) { struct netlink_table *table; table = &nl_table[sk->sk_protocol]; if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, netlink_rhashtable_params)) { WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } netlink_table_grab(); if (nlk_sk(sk)->subscriptions) { __sk_del_bind_node(sk); netlink_update_listeners(sk); } if (sk->sk_protocol == NETLINK_GENERIC) atomic_inc(&genl_sk_destructing_cnt); netlink_table_ungrab(); } static struct proto netlink_proto = { .name = "NETLINK", .owner = THIS_MODULE, .obj_size = sizeof(struct netlink_sock), }; static int __netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); nlk = nlk_sk(sk); mutex_init(&nlk->nl_cb_mutex); lockdep_set_class_and_name(&nlk->nl_cb_mutex, nlk_cb_mutex_keys + protocol, nlk_cb_mutex_key_strings[protocol]); init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; } static int netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct module *module = NULL; struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release)(struct sock *sock, unsigned long *groups); int err = 0; sock->state = SS_UNCONNECTED; if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; protocol = array_index_nospec(protocol, MAX_LINKS); netlink_lock_table(); #ifdef CONFIG_MODULES if (!nl_table[protocol].registered) { netlink_unlock_table(); request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); netlink_lock_table(); } #endif if (nl_table[protocol].registered && try_module_get(nl_table[protocol].module)) module = nl_table[protocol].module; else err = -EPROTONOSUPPORT; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; release = nl_table[protocol].release; netlink_unlock_table(); if (err < 0) goto out; err = __netlink_create(net, sock, protocol, kern); if (err < 0) goto out_module; sock_prot_inuse_add(net, &netlink_proto, 1); nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; nlk->netlink_release = release; out: return err; out_module: module_put(module); goto out; } static void deferred_put_nlk_sk(struct rcu_head *head) { struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; kfree(nlk->groups); nlk->groups = NULL; if (!refcount_dec_and_test(&sk->sk_refcnt)) return; sk_free(sk); } static int netlink_release(struct socket *sock) { struct sock *sk = sock->sk; struct netlink_sock *nlk; if (!sk) return 0; netlink_remove(sk); sock_orphan(sk); nlk = nlk_sk(sk); /* * OK. Socket is unlinked, any packets that arrive now * will be purged. */ if (nlk->netlink_release) nlk->netlink_release(sk, nlk->groups); /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock */ if (nlk->netlink_unbind) { int i; for (i = 0; i < nlk->ngroups; i++) if (test_bit(i, nlk->groups)) nlk->netlink_unbind(sock_net(sk), i + 1); } if (sk->sk_protocol == NETLINK_GENERIC && atomic_dec_return(&genl_sk_destructing_cnt) == 0) wake_up(&genl_sk_destructing_waitq); sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); skb_queue_purge(&sk->sk_write_queue); if (nlk->portid && nlk->bound) { struct netlink_notify n = { .net = sock_net(sk), .protocol = sk->sk_protocol, .portid = nlk->portid, }; blocking_notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } /* Terminate any outstanding dump */ if (nlk->cb_running) { if (nlk->cb.done) nlk->cb.done(&nlk->cb); module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); } module_put(nlk->module); if (netlink_is_kernel(sk)) { netlink_table_grab(); BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { struct listeners *old; old = nl_deref_protected(nl_table[sk->sk_protocol].listeners); RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL); kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } netlink_table_ungrab(); } sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); /* Because struct net might disappear soon, do not keep a pointer. */ if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); /* Because of deferred_put_nlk_sk and use of work queue, * it is possible netns will be freed before this socket. */ sock_net_set(sk, &init_net); __netns_tracker_alloc(&init_net, &sk->ns_tracker, false, GFP_KERNEL); } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; s32 rover = -4096; bool ok; retry: cond_resched(); rcu_read_lock(); ok = !__netlink_lookup(table, portid, net); rcu_read_unlock(); if (!ok) { /* Bind collision, search negative portid values. */ if (rover == -4096) /* rover will be in range [S32_MIN, -4097] */ rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN); else if (rover >= -4096) rover = -4097; portid = rover--; goto retry; } err = netlink_insert(sk, portid); if (err == -EADDRINUSE) goto retry; /* If 2 threads race to autobind, that is fine. */ if (err == -EBUSY) err = 0; return err; } /** * __netlink_ns_capable - General netlink message capability test * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. */ bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *user_ns, int cap) { return ((nsp->flags & NETLINK_SKB_DST) || file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(__netlink_ns_capable); /** * netlink_ns_capable - General netlink message capability test * @skb: socket buffer holding a netlink command from userspace * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. */ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *user_ns, int cap) { return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); } EXPORT_SYMBOL(netlink_ns_capable); /** * netlink_capable - Netlink global message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in all user namespaces. */ bool netlink_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, &init_user_ns, cap); } EXPORT_SYMBOL(netlink_capable); /** * netlink_net_capable - Netlink network namespace message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap over the network namespace of * the socket we received the message from. */ bool netlink_net_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); } EXPORT_SYMBOL(netlink_net_capable); static inline int netlink_allowed(const struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].flags & flag) || ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); } static void netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->subscriptions && !subscriptions) __sk_del_bind_node(sk); else if (!nlk->subscriptions && subscriptions) sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); nlk->subscriptions = subscriptions; } static int netlink_realloc_groups(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); unsigned int groups; unsigned long *new_groups; int err = 0; netlink_table_grab(); groups = nl_table[sk->sk_protocol].groups; if (!nl_table[sk->sk_protocol].registered) { err = -ENOENT; goto out_unlock; } if (nlk->ngroups >= groups) goto out_unlock; new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC); if (new_groups == NULL) { err = -ENOMEM; goto out_unlock; } memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); nlk->groups = new_groups; nlk->ngroups = groups; out_unlock: netlink_table_ungrab(); return err; } static void netlink_undo_bind(int group, long unsigned int groups, struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); int undo; if (!nlk->netlink_unbind) return; for (undo = 0; undo < group; undo++) if (test_bit(undo, &groups)) nlk->netlink_unbind(sock_net(sk), undo + 1); } static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err = 0; unsigned long groups; bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; if (nladdr->nl_family != AF_NETLINK) return -EINVAL; groups = nladdr->nl_groups; /* Only superuser is allowed to listen multicasts */ if (groups) { if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; } if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; /* Paired with WRITE_ONCE() in netlink_insert() */ bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); if (nladdr->nl_pid != nlk->portid) return -EINVAL; } if (nlk->netlink_bind && groups) { int group; /* nl_groups is a u32, so cap the maximum groups we can bind */ for (group = 0; group < BITS_PER_TYPE(u32); group++) { if (!test_bit(group, &groups)) continue; err = nlk->netlink_bind(net, group + 1); if (!err) continue; netlink_undo_bind(group, groups, sk); return err; } } /* No need for barriers here as we return to user-space without * using any of the bound attributes. */ netlink_lock_table(); if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); if (err) { netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk); goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) goto unlock; netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + hweight32(groups) - hweight32(nlk->groups[0])); nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); return 0; unlock: netlink_unlock_table(); return err; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { int err = 0; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; if (alen < sizeof(addr->sa_family)) return -EINVAL; if (addr->sa_family == AF_UNSPEC) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, 0); WRITE_ONCE(nlk->dst_group, 0); return 0; } if (addr->sa_family != AF_NETLINK) return -EINVAL; if (alen < sizeof(struct sockaddr_nl)) return -EINVAL; if ((nladdr->nl_groups || nladdr->nl_pid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; /* No need for barriers here as we return to user-space without * using any of the bound attributes. * Paired with WRITE_ONCE() in netlink_insert(). */ if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid); WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups)); } return err; } static int netlink_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr); nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; if (peer) { /* Paired with WRITE_ONCE() in netlink_connect() */ nladdr->nl_pid = READ_ONCE(nlk->dst_portid); nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group)); } else { /* Paired with WRITE_ONCE() in netlink_insert() */ nladdr->nl_pid = READ_ONCE(nlk->portid); netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); } return sizeof(*nladdr); } static int netlink_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { /* try to hand this ioctl down to the NIC drivers. */ return -ENOIOCTLCMD; } static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; struct netlink_sock *nlk; sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid); if (!sock) return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); /* dst_portid and sk_state can be changed in netlink_connect() */ if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED && READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } return sock; } struct sock *netlink_getsockbyfd(int fd) { CLASS(fd, f)(fd); struct inode *inode; struct sock *sock; if (fd_empty(f)) return ERR_PTR(-EBADF); inode = file_inode(fd_file(f)); if (!S_ISSOCK(inode->i_mode)) return ERR_PTR(-ENOTSOCK); sock = SOCKET_I(inode)->sk; if (sock->sk_family != AF_NETLINK) return ERR_PTR(-EINVAL); sock_hold(sock); return sock; } struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast) { size_t head_size = SKB_HEAD_ALIGN(size); struct sk_buff *skb; void *data; if (head_size <= PAGE_SIZE || broadcast) return alloc_skb(size, GFP_KERNEL); data = kvmalloc(head_size, GFP_KERNEL); if (!data) return NULL; skb = __build_skb(data, head_size); if (!skb) kvfree(data); else if (is_vmalloc_addr(data)) skb->destructor = netlink_skb_destructor; return skb; } /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the * reference is dropped. The skb is not send to the destination, just all * all error checks are performed and memory in the queue is reserved. * Return values: * < 0: error. skb freed, reference to sock dropped. * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. */ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk) { struct netlink_sock *nlk; nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(NETLINK_S_CONGESTED, &nlk->state))) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); return -EAGAIN; } __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); sock_put(sk); if (signal_pending(current)) { kfree_skb(skb); return sock_intr_errno(*timeo); } return 1; } netlink_skb_set_owner_r(skb, sk); return 0; } static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; netlink_deliver_tap(sock_net(sk), skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } int netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = __netlink_sendskb(sk, skb); sock_put(sk); return len; } void netlink_detachskb(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); sock_put(sk); } static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; skb_assert_len(skb); WARN_ON(skb->sk != NULL); delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, allocation); if (!nskb) return skb; consume_skb(skb); skb = nskb; } pskb_expand_head(skb, 0, -delta, (allocation & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); return skb; } static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, struct sock *ssk) { int ret; struct netlink_sock *nlk = nlk_sk(sk); ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); consume_skb(skb); } else { kfree_skb(skb); } sock_put(sk); return ret; } int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 portid, int nonblock) { struct sock *sk; int err; long timeo; skb = netlink_trim(skb, gfp_any()); timeo = sock_sndtimeo(ssk, nonblock); retry: sk = netlink_getsockbyportid(ssk, portid); if (IS_ERR(sk)) { kfree_skb(skb); return PTR_ERR(sk); } if (netlink_is_kernel(sk)) return netlink_unicast_kernel(sk, skb, ssk); if (sk_filter(sk, skb)) { err = skb->len; kfree_skb(skb); sock_put(sk); return err; } err = netlink_attachskb(sk, skb, &timeo, ssk); if (err == 1) goto retry; if (err) return err; return netlink_sendskb(sk, skb); } EXPORT_SYMBOL(netlink_unicast); int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; struct listeners *listeners; BUG_ON(!netlink_is_kernel(sk)); rcu_read_lock(); listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); if (listeners && group - 1 < nl_table[sk->sk_protocol].groups) res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(netlink_has_listeners); bool netlink_strict_get_check(struct sk_buff *skb) { return nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); } EXPORT_SYMBOL_GPL(netlink_strict_get_check); static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); } return -1; } struct netlink_broadcast_data { struct sock *exclude_sk; struct net *net; u32 portid; u32 group; int failure; int delivery_failure; int congested; int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); void *tx_data; }; static void do_one_broadcast(struct sock *sk, struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) return; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) return; if (!net_eq(sock_net(sk), p->net)) { if (!nlk_test_bit(LISTEN_ALL_NSID, sk)) return; if (!peernet_has_id(sock_net(sk), p->net)) return; if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, CAP_NET_BROADCAST)) return; } if (p->failure) { netlink_overrun(sk); return; } sock_hold(sk); if (p->skb2 == NULL) { if (skb_shared(p->skb)) { p->skb2 = skb_clone(p->skb, p->allocation); } else { p->skb2 = skb_get(p->skb); /* * skb ownership may have been set when * delivered to a previous socket. */ skb_orphan(p->skb2); } } if (p->skb2 == NULL) { netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; goto out; } if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) NETLINK_CB(p->skb2).nsid_is_set = true; val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; p->skb2 = NULL; } out: sock_put(sk); } int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; struct sock *sk; skb = netlink_trim(skb, allocation); info.exclude_sk = ssk; info.net = net; info.portid = portid; info.group = group; info.failure = 0; info.delivery_failure = 0; info.congested = 0; info.delivered = 0; info.allocation = allocation; info.skb = skb; info.skb2 = NULL; info.tx_filter = filter; info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ netlink_lock_table(); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); consume_skb(skb); netlink_unlock_table(); if (info.delivery_failure) { kfree_skb(info.skb2); return -ENOBUFS; } consume_skb(info.skb2); if (info.delivered) { if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } return -ESRCH; } EXPORT_SYMBOL(netlink_broadcast_filtered); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation) { return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, NULL, NULL); } EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { struct sock *exclude_sk; u32 portid; u32 group; int code; }; static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int ret = 0; if (sk == p->exclude_sk) goto out; if (!net_eq(sock_net(sk), sock_net(p->exclude_sk))) goto out; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk)) { ret = 1; goto out; } WRITE_ONCE(sk->sk_err, p->code); sk_error_report(sk); out: return ret; } /** * netlink_set_err - report error to broadcast listeners * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_NO_ENOBUFS socket option. */ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; unsigned long flags; struct sock *sk; int ret = 0; info.exclude_sk = ssk; info.portid = portid; info.group = group; /* sk->sk_err wants a positive error value */ info.code = -code; read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); read_unlock_irqrestore(&nl_table_lock, flags); return ret; } EXPORT_SYMBOL(netlink_set_err); /* must be called with netlink table grabbed */ static void netlink_update_socket_mc(struct netlink_sock *nlk, unsigned int group, int is_new) { int old, new = !!is_new, subscriptions; old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; __assign_bit(group - 1, nlk->groups, new); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } static int netlink_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int val = 0; int nr = -1; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (optlen >= sizeof(int) && copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { case NETLINK_PKTINFO: nr = NETLINK_F_RECV_PKTINFO; break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { int err; if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { err = nlk->netlink_bind(sock_net(sk), val); if (err) return err; } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) nlk->netlink_unbind(sock_net(sk), val); break; } case NETLINK_BROADCAST_ERROR: nr = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val); if (val) { clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } break; case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; nr = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: nr = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: nr = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: nr = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (nr >= 0) assign_bit(nr, &nlk->flags, val); return 0; } static int netlink_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int flag; int len, val; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETLINK_PKTINFO: flag = NETLINK_F_RECV_PKTINFO; break; case NETLINK_BROADCAST_ERROR: flag = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: flag = NETLINK_F_RECV_NO_ENOBUFS; break; case NETLINK_LIST_MEMBERSHIPS: { int pos, idx, shift, err = 0; netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) break; idx = pos / sizeof(unsigned long); shift = (pos % sizeof(unsigned long)) * 8; if (put_user((u32)(nlk->groups[idx] >> shift), (u32 __user *)(optval + pos))) { err = -EFAULT; break; } } if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); return err; } case NETLINK_LISTEN_ALL_NSID: flag = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: flag = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: flag = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: flag = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = test_bit(flag, &nlk->flags); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct nl_pktinfo info; info.group = NETLINK_CB(skb).dst_group; put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { if (!NETLINK_CB(skb).nsid_is_set) return; put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), &NETLINK_CB(skb).nsid); } static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); u32 dst_portid; u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; u32 netlink_skb_flags = 0; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (len == 0) { pr_warn_once("Zero length message leads to an empty skb\n"); return -ENODATA; } err = scm_send(sock, msg, &scm, true); if (err < 0) return err; if (msg->msg_namelen) { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_nl)) goto out; if (addr->nl_family != AF_NETLINK) goto out; dst_portid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_portid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; netlink_skb_flags |= NETLINK_SKB_DST; } else { /* Paired with WRITE_ONCE() in netlink_connect() */ dst_portid = READ_ONCE(nlk->dst_portid); dst_group = READ_ONCE(nlk->dst_group); } /* Paired with WRITE_ONCE() in netlink_insert() */ if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; } else { /* Ensure nlk is hashed and visible. */ smp_rmb(); } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).creds = scm.creds; NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { kfree_skb(skb); goto out; } err = security_netlink_send(sk, skb); if (err) { kfree_skb(skb); goto out; } if (dst_group) { refcount_inc(&skb->users); netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT); out: scm_destroy(&scm); return err; } static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); size_t copied, max_recvmsg_len; struct sk_buff *skb, *data_skb; int err, ret; if (flags & MSG_OOB) return -EOPNOTSUPP; copied = 0; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; data_skb = skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES if (unlikely(skb_shinfo(skb)->frag_list)) { /* * If this skb has a frag_list, then here that means that we * will have to use the frag_list skb's data for compat tasks * and the regular skb's data for normal (non-compat) tasks. * * If we need to send the compat skb, assign it to the * 'data_skb' variable so that it will be used below for data * copying. We keep 'skb' for everything else, including * freeing both later. */ if (flags & MSG_CMSG_COMPAT) data_skb = skb_shinfo(skb)->frag_list; } #endif /* Record the max length of recvmsg() calls for future allocations */ max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len); max_recvmsg_len = min_t(size_t, max_recvmsg_len, SKB_WITH_OVERHEAD(32768)); WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len); copied = data_skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).portid; addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } if (nlk_test_bit(RECV_PKTINFO, sk)) netlink_cmsg_recv_pktinfo(msg, skb); if (nlk_test_bit(LISTEN_ALL_NSID, sk)) netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); if (flags & MSG_TRUNC) copied = data_skb->len; skb_free_datagram(sk, skb); if (READ_ONCE(nlk->cb_running) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk, false); if (ret) { WRITE_ONCE(sk->sk_err, -ret); sk_error_report(sk); } } scm_recv(sock, msg, &scm, flags); out: netlink_rcv_wake(sk); return err ? : copied; } static void netlink_data_ready(struct sock *sk) { BUG(); } /* * We export these functions to other modules. They provide a * complete set of kernel non-blocking support for message * queueing. */ struct sock * __netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; unsigned int groups; BUG_ON(!nl_table); if (unit < 0 || unit >= MAX_LINKS) return NULL; if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; if (__netlink_create(net, sock, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; if (!cfg || cfg->groups < 32) groups = 32; else groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; if (cfg && cfg->input) nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, 0)) goto out_sock_release; nlk = nlk_sk(sk); set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags); netlink_table_grab(); if (!nl_table[unit].registered) { nl_table[unit].groups = groups; rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].module = module; if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; nl_table[unit].release = cfg->release; nl_table[unit].flags = cfg->flags; } nl_table[unit].registered = 1; } else { kfree(listeners); nl_table[unit].registered++; } netlink_table_ungrab(); return sk; out_sock_release: kfree(listeners); netlink_kernel_release(sk); return NULL; out_sock_release_nosk: sock_release(sock); return NULL; } EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) { if (sk == NULL || sk->sk_socket == NULL) return; sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { struct listeners *new, *old; struct netlink_table *tbl = &nl_table[sk->sk_protocol]; if (groups < 32) groups = 32; if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); if (!new) return -ENOMEM; old = nl_deref_protected(tbl->listeners); memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); rcu_assign_pointer(tbl->listeners, new); kfree_rcu(old, rcu); } tbl->groups = groups; return 0; } /** * netlink_change_ngroups - change number of multicast groups * * This changes the number of multicast groups that are available * on a certain netlink family. Note that it is not possible to * change the number of groups to below 32. Also note that it does * not implicitly call netlink_clear_multicast_users() when the * number of groups is reduced. * * @sk: The kernel netlink socket, as returned by netlink_kernel_create(). * @groups: The new number of groups. */ int netlink_change_ngroups(struct sock *sk, unsigned int groups) { int err; netlink_table_grab(); err = __netlink_change_ngroups(sk, groups); netlink_table_ungrab(); return err; } void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; struct hlist_node *tmp; sk_for_each_bound_safe(sk, tmp, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; int size = nlmsg_msg_size(len); nlh = skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; nlh->nlmsg_len = size; nlh->nlmsg_flags = flags; nlh->nlmsg_pid = portid; nlh->nlmsg_seq = seq; if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size); return nlh; } EXPORT_SYMBOL(__nlmsg_put); static size_t netlink_ack_tlv_len(struct netlink_sock *nlk, int err, const struct netlink_ext_ack *extack) { size_t tlvlen; if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) return 0; tlvlen = 0; if (extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); if (extack->cookie_len) tlvlen += nla_total_size(extack->cookie_len); /* Following attributes are only reported as error (not warning) */ if (!err) return tlvlen; if (extack->bad_attr) tlvlen += nla_total_size(sizeof(u32)); if (extack->policy) tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); if (extack->miss_type) tlvlen += nla_total_size(sizeof(u32)); if (extack->miss_nest) tlvlen += nla_total_size(sizeof(u32)); return tlvlen; } static bool nlmsg_check_in_payload(const struct nlmsghdr *nlh, const void *addr) { return !WARN_ON(addr < nlmsg_data(nlh) || addr - (const void *) nlh >= nlh->nlmsg_len); } static void netlink_ack_tlv_fill(struct sk_buff *skb, const struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { if (extack->_msg) WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, extack->cookie_len, extack->cookie)); if (!err) return; if (extack->bad_attr && nlmsg_check_in_payload(nlh, extack->bad_attr)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, (u8 *)extack->bad_attr - (const u8 *)nlh)); if (extack->policy) netlink_policy_dump_write_attr(skb, extack->policy, NLMSGERR_ATTR_POLICY); if (extack->miss_type) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, extack->miss_type)); if (extack->miss_nest && nlmsg_check_in_payload(nlh, extack->miss_nest)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, (u8 *)extack->miss_nest - (const u8 *)nlh)); } /* * It looks a bit ugly. * It would be better to create kernel thread. */ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, struct netlink_callback *cb, struct netlink_ext_ack *extack) { struct nlmsghdr *nlh; size_t extack_len; nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), NLM_F_MULTI | cb->answer_flags); if (WARN_ON(!nlh)) return -ENOBUFS; nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); extack_len = netlink_ack_tlv_len(nlk, nlk->dump_done_errno, extack); if (extack_len) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; if (skb_tailroom(skb) >= extack_len) { netlink_ack_tlv_fill(skb, cb->nlh, nlk->dump_done_errno, extack); nlmsg_end(skb, nlh); } } return 0; } static int netlink_dump(struct sock *sk, bool lock_taken) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; size_t max_recvmsg_len; struct module *module; int err = -ENOBUFS; int alloc_min_size; int alloc_size; if (!lock_taken) mutex_lock(&nlk->nl_cb_mutex); if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; } if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; /* NLMSG_GOODSIZE is small to avoid high order allocations being * required, but it makes sense to _attempt_ a 32KiB allocation * to reduce number of system calls on dump operations, if user * ever provided a big enough buffer. */ cb = &nlk->cb; alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len); if (alloc_min_size < max_recvmsg_len) { alloc_size = max_recvmsg_len; skb = alloc_skb(alloc_size, (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; /* Trim skb to allocated size. User is expected to provide buffer as * large as max(min_dump_alloc, 32KiB (max_recvmsg_len capped at * netlink_recvmsg())). dump will pack as many smaller messages as * could fit within the allocated skb. skb is typically allocated * with larger space than required (could be as much as near 2x the * requested size with align to next power of 2 approach). Allowing * dump to use the excess space makes it difficult for a user to have a * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); /* Make sure malicious BPF programs can not read unitialized memory * from skb->head -> skb->data */ skb_reset_network_header(skb); skb_reset_mac_header(skb); netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { cb->extack = &extack; nlk->dump_done_errno = cb->dump(skb, cb); /* EMSGSIZE plus something already in the skb means * that there's more to dump but current skb has filled up. * If the callback really wants to return EMSGSIZE to user space * it needs to do so again, on the next cb->dump() call, * without putting data in the skb. */ if (nlk->dump_done_errno == -EMSGSIZE && skb->len) nlk->dump_done_errno = skb->len; cb->extack = NULL; } if (nlk->dump_done_errno > 0 || skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { mutex_unlock(&nlk->nl_cb_mutex); if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); return 0; } if (netlink_dump_done(nlk, skb, cb, &extack)) goto errout_skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES /* frag_list skb's data is used for compat tasks * and the regular skb's data for normal (non-compat) tasks. * See netlink_recvmsg(). */ if (unlikely(skb_shinfo(skb)->frag_list)) { if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack)) goto errout_skb; } #endif if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); if (cb->done) cb->done(cb); WRITE_ONCE(nlk->cb_running, false); module = cb->module; skb = cb->skb; mutex_unlock(&nlk->nl_cb_mutex); module_put(module); consume_skb(skb); return 0; errout_skb: mutex_unlock(&nlk->nl_cb_mutex); kfree_skb(skb); return err; } int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { struct netlink_callback *cb; struct netlink_sock *nlk; struct sock *sk; int ret; refcount_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { ret = -ECONNREFUSED; goto error_free; } nlk = nlk_sk(sk); mutex_lock(&nlk->nl_cb_mutex); /* A dump is in progress... */ if (nlk->cb_running) { ret = -EBUSY; goto error_unlock; } /* add reference of module which cb->dump belongs to */ if (!try_module_get(control->module)) { ret = -EPROTONOSUPPORT; goto error_unlock; } cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; cb->data = control->data; cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; cb->flags = control->flags; cb->skb = skb; cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); if (control->start) { cb->extack = control->extack; ret = control->start(cb); cb->extack = NULL; if (ret) goto error_put; } WRITE_ONCE(nlk->cb_running, true); nlk->dump_done_errno = INT_MAX; ret = netlink_dump(sk, true); sock_put(sk); if (ret) return ret; /* We successfully started a dump, by returning -EINTR we * signal not to send ACK even if it was requested. */ return -EINTR; error_put: module_put(control->module); error_unlock: sock_put(sk); mutex_unlock(&nlk->nl_cb_mutex); error_free: kfree_skb(skb); return ret; } EXPORT_SYMBOL(__netlink_dump_start); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { struct sk_buff *skb; struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); unsigned int flags = 0; size_t tlvlen; /* Error messages get the original request appened, unless the user * requests to cap the error message, and get extra error data if * requested. */ if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags)) payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; tlvlen = netlink_ack_tlv_len(nlk, err, extack); if (tlvlen) flags |= NLM_F_ACK_TLVS; skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) goto err_skb; rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, sizeof(*errmsg), flags); if (!rep) goto err_bad_put; errmsg = nlmsg_data(rep); errmsg->error = err; errmsg->msg = *nlh; if (!(flags & NLM_F_CAPPED)) { if (!nlmsg_append(skb, nlmsg_len(nlh))) goto err_bad_put; memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh), nlmsg_len(nlh)); } if (tlvlen) netlink_ack_tlv_fill(skb, nlh, err, extack); nlmsg_end(skb, rep); nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); return; err_bad_put: nlmsg_free(skb); err_skb: WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS); sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { struct netlink_ext_ack extack; struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { int msglen; memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; /* Only requests are handled by the kernel */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) goto ack; /* Skip control messages */ if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } return 0; } EXPORT_SYMBOL(netlink_rcv_skb); /** * nlmsg_notify - send a notification netlink message * @sk: netlink socket to use * @skb: notification message * @portid: destination netlink portid for reports or 0 * @group: destination multicast group or 0 * @report: 1 to report back, 0 to disable * @flags: allocation flags */ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags) { int err = 0; if (group) { int exclude_portid = 0; if (report) { refcount_inc(&skb->users); exclude_portid = portid; } /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); if (err == -ESRCH) err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); if (!err) err = err2; } return err; } EXPORT_SYMBOL(nlmsg_notify); #ifdef CONFIG_PROC_FS struct nl_seq_iter { struct seq_net_private p; struct rhashtable_iter hti; int link; }; static void netlink_walk_start(struct nl_seq_iter *iter) { rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti); rhashtable_walk_start(&iter->hti); } static void netlink_walk_stop(struct nl_seq_iter *iter) { rhashtable_walk_stop(&iter->hti); rhashtable_walk_exit(&iter->hti); } static void *__netlink_seq_next(struct seq_file *seq) { struct nl_seq_iter *iter = seq->private; struct netlink_sock *nlk; do { for (;;) { nlk = rhashtable_walk_next(&iter->hti); if (IS_ERR(nlk)) { if (PTR_ERR(nlk) == -EAGAIN) continue; return nlk; } if (nlk) break; netlink_walk_stop(iter); if (++iter->link >= MAX_LINKS) return NULL; netlink_walk_start(iter); } } while (sock_net(&nlk->sk) != seq_file_net(seq)); return nlk; } static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) __acquires(RCU) { struct nl_seq_iter *iter = seq->private; void *obj = SEQ_START_TOKEN; loff_t pos; iter->link = 0; netlink_walk_start(iter); for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) obj = __netlink_seq_next(seq); return obj; } static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return __netlink_seq_next(seq); } static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; if (iter->link >= MAX_LINKS) return; netlink_walk_stop(iter); } static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk Eth Pid Groups " "Rmem Wmem Dump Locks Drops Inode\n"); } else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n", s, s->sk_protocol, nlk->portid, nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) ); } return 0; } #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__netlink { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct netlink_sock *, sk); }; DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) static int netlink_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__netlink ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.sk = nlk_sk((struct sock *)v); return bpf_iter_run_prog(prog, &ctx); } static int netlink_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return netlink_native_seq_show(seq, v); if (v != SEQ_START_TOKEN) return netlink_prog_seq_show(prog, &meta, v); return 0; } static void netlink_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)netlink_prog_seq_show(prog, &meta, v); } netlink_native_seq_stop(seq, v); } #else static int netlink_seq_show(struct seq_file *seq, void *v) { return netlink_native_seq_show(seq, v); } static void netlink_seq_stop(struct seq_file *seq, void *v) { netlink_native_seq_stop(seq, v); } #endif static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, .stop = netlink_seq_stop, .show = netlink_seq_show, }; #endif int netlink_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_register_notifier); int netlink_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_unregister_notifier); static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, .bind = netlink_bind, .connect = netlink_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = netlink_setsockopt, .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, }; static const struct net_proto_family netlink_family_ops = { .family = PF_NETLINK, .create = netlink_create, .owner = THIS_MODULE, /* for consistency 8) */ }; static int __net_init netlink_net_init(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops, sizeof(struct nl_seq_iter))) return -ENOMEM; #endif return 0; } static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("netlink", net->proc_net); #endif } static void __init netlink_add_usersock_entry(void) { struct listeners *listeners; int groups = 32; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) panic("netlink_add_usersock_entry: Cannot allocate listeners\n"); netlink_table_grab(); nl_table[NETLINK_USERSOCK].groups = groups; rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND; netlink_table_ungrab(); } static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, }; static inline u32 netlink_hash(const void *data, u32 len, u32 seed) { const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } static const struct rhashtable_params netlink_rhashtable_params = { .head_offset = offsetof(struct netlink_sock, node), .key_len = netlink_compare_arg_len, .obj_hashfn = netlink_hash, .obj_cmpfn = netlink_compare, .automatic_shrinking = true, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) BTF_ID_LIST(btf_netlink_sock_id) BTF_ID(struct, netlink_sock) static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), }; static struct bpf_iter_reg netlink_reg_info = { .target = "netlink", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__netlink, sk), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &netlink_seq_info, }; static int __init bpf_iter_register(void) { netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id; return bpf_iter_reg_target(&netlink_reg_info); } #endif static int __init netlink_proto_init(void) { int i; int err = proto_register(&netlink_proto, 0); if (err != 0) goto out; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) err = bpf_iter_register(); if (err) goto out; #endif BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table) goto panic; for (i = 0; i < MAX_LINKS; i++) { if (rhashtable_init(&nl_table[i].hash, &netlink_rhashtable_params) < 0) goto panic; } netlink_add_usersock_entry(); sock_register(&netlink_family_ops); register_pernet_subsys(&netlink_net_ops); register_pernet_subsys(&netlink_tap_net_ops); /* The netlink device handler may be needed early. */ rtnetlink_init(); out: return err; panic: panic("netlink_init: Cannot allocate nl_table\n"); } core_initcall(netlink_proto_init);
5 1 7 6 6 1 6 1 1 1 15 9 9 9 9 4 4 6 6 6 6 6 1 1 1 1 8 3 8 1 7 7 2 5 6 6 6 8 8 1 1 2 2 2 2 1 1 1 3 3 1 2 6 1 5 2 3 3 5 5 5 5 1 1 1 1 1 9 12 5 5 5 4 1 1 1 1 1 1 1 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 /* RFCOMM implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com> Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* * Bluetooth RFCOMM core. */ #include <linux/module.h> #include <linux/debugfs.h> #include <linux/kthread.h> #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/rfcomm.h> #include <trace/events/sock.h> #define VERSION "1.11" static bool disable_cfc; static bool l2cap_ertm; static int channel_mtu = -1; static struct task_struct *rfcomm_thread; static DEFINE_MUTEX(rfcomm_mutex); #define rfcomm_lock() mutex_lock(&rfcomm_mutex) #define rfcomm_unlock() mutex_unlock(&rfcomm_mutex) static LIST_HEAD(session_list); static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len); static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci); static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci); static int rfcomm_queue_disc(struct rfcomm_dlc *d); static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type); static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d); static int rfcomm_send_msc(struct rfcomm_session *s, int cr, u8 dlci, u8 v24_sig); static int rfcomm_send_test(struct rfcomm_session *s, int cr, u8 *pattern, int len); static int rfcomm_send_credits(struct rfcomm_session *s, u8 addr, u8 credits); static void rfcomm_make_uih(struct sk_buff *skb, u8 addr); static void rfcomm_process_connect(struct rfcomm_session *s); static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst, u8 sec_level, int *err); static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst); static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s); /* ---- RFCOMM frame parsing macros ---- */ #define __get_dlci(b) ((b & 0xfc) >> 2) #define __get_type(b) ((b & 0xef)) #define __test_ea(b) ((b & 0x01)) #define __test_cr(b) (!!(b & 0x02)) #define __test_pf(b) (!!(b & 0x10)) #define __session_dir(s) ((s)->initiator ? 0x00 : 0x01) #define __addr(cr, dlci) (((dlci & 0x3f) << 2) | (cr << 1) | 0x01) #define __ctrl(type, pf) (((type & 0xef) | (pf << 4))) #define __dlci(dir, chn) (((chn & 0x1f) << 1) | dir) #define __srv_channel(dlci) (dlci >> 1) #define __len8(len) (((len) << 1) | 1) #define __len16(len) ((len) << 1) /* MCC macros */ #define __mcc_type(cr, type) (((type << 2) | (cr << 1) | 0x01)) #define __get_mcc_type(b) ((b & 0xfc) >> 2) #define __get_mcc_len(b) ((b & 0xfe) >> 1) /* RPN macros */ #define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x7) << 3)) #define __get_rpn_data_bits(line) ((line) & 0x3) #define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) #define __get_rpn_parity(line) (((line) >> 3) & 0x7) static DECLARE_WAIT_QUEUE_HEAD(rfcomm_wq); static void rfcomm_schedule(void) { wake_up_all(&rfcomm_wq); } /* ---- RFCOMM FCS computation ---- */ /* reversed, 8-bit, poly=0x07 */ static unsigned char rfcomm_crc_table[256] = { 0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75, 0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b, 0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69, 0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67, 0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d, 0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43, 0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51, 0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f, 0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05, 0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b, 0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19, 0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17, 0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d, 0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33, 0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21, 0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f, 0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95, 0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b, 0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89, 0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87, 0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad, 0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3, 0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1, 0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf, 0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5, 0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb, 0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9, 0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7, 0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd, 0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3, 0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1, 0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf }; /* CRC on 2 bytes */ #define __crc(data) (rfcomm_crc_table[rfcomm_crc_table[0xff ^ data[0]] ^ data[1]]) /* FCS on 2 bytes */ static inline u8 __fcs(u8 *data) { return 0xff - __crc(data); } /* FCS on 3 bytes */ static inline u8 __fcs2(u8 *data) { return 0xff - rfcomm_crc_table[__crc(data) ^ data[2]]; } /* Check FCS */ static inline int __check_fcs(u8 *data, int type, u8 fcs) { u8 f = __crc(data); if (type != RFCOMM_UIH) f = rfcomm_crc_table[f ^ data[2]]; return rfcomm_crc_table[f ^ fcs] != 0xcf; } /* ---- L2CAP callbacks ---- */ static void rfcomm_l2state_change(struct sock *sk) { BT_DBG("%p state %d", sk, sk->sk_state); rfcomm_schedule(); } static void rfcomm_l2data_ready(struct sock *sk) { trace_sk_data_ready(sk); BT_DBG("%p", sk); rfcomm_schedule(); } static int rfcomm_l2sock_create(struct socket **sock) { int err; BT_DBG(""); err = sock_create_kern(&init_net, PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); if (!err) { struct sock *sk = (*sock)->sk; sk->sk_data_ready = rfcomm_l2data_ready; sk->sk_state_change = rfcomm_l2state_change; } return err; } static int rfcomm_check_security(struct rfcomm_dlc *d) { struct sock *sk = d->session->sock->sk; struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn; __u8 auth_type; switch (d->sec_level) { case BT_SECURITY_HIGH: case BT_SECURITY_FIPS: auth_type = HCI_AT_GENERAL_BONDING_MITM; break; case BT_SECURITY_MEDIUM: auth_type = HCI_AT_GENERAL_BONDING; break; default: auth_type = HCI_AT_NO_BONDING; break; } return hci_conn_security(conn->hcon, d->sec_level, auth_type, d->out); } static void rfcomm_session_timeout(struct timer_list *t) { struct rfcomm_session *s = from_timer(s, t, timer); BT_DBG("session %p state %ld", s, s->state); set_bit(RFCOMM_TIMED_OUT, &s->flags); rfcomm_schedule(); } static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout) { BT_DBG("session %p state %ld timeout %ld", s, s->state, timeout); mod_timer(&s->timer, jiffies + timeout); } static void rfcomm_session_clear_timer(struct rfcomm_session *s) { BT_DBG("session %p state %ld", s, s->state); del_timer_sync(&s->timer); } /* ---- RFCOMM DLCs ---- */ static void rfcomm_dlc_timeout(struct timer_list *t) { struct rfcomm_dlc *d = from_timer(d, t, timer); BT_DBG("dlc %p state %ld", d, d->state); set_bit(RFCOMM_TIMED_OUT, &d->flags); rfcomm_dlc_put(d); rfcomm_schedule(); } static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout) { BT_DBG("dlc %p state %ld timeout %ld", d, d->state, timeout); if (!mod_timer(&d->timer, jiffies + timeout)) rfcomm_dlc_hold(d); } static void rfcomm_dlc_clear_timer(struct rfcomm_dlc *d) { BT_DBG("dlc %p state %ld", d, d->state); if (del_timer(&d->timer)) rfcomm_dlc_put(d); } static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d) { BT_DBG("%p", d); d->state = BT_OPEN; d->flags = 0; d->mscex = 0; d->sec_level = BT_SECURITY_LOW; d->mtu = RFCOMM_DEFAULT_MTU; d->v24_sig = RFCOMM_V24_RTC | RFCOMM_V24_RTR | RFCOMM_V24_DV; d->cfc = RFCOMM_CFC_DISABLED; d->rx_credits = RFCOMM_DEFAULT_CREDITS; } struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio) { struct rfcomm_dlc *d = kzalloc(sizeof(*d), prio); if (!d) return NULL; timer_setup(&d->timer, rfcomm_dlc_timeout, 0); skb_queue_head_init(&d->tx_queue); mutex_init(&d->lock); refcount_set(&d->refcnt, 1); rfcomm_dlc_clear_state(d); BT_DBG("%p", d); return d; } void rfcomm_dlc_free(struct rfcomm_dlc *d) { BT_DBG("%p", d); skb_queue_purge(&d->tx_queue); kfree(d); } static void rfcomm_dlc_link(struct rfcomm_session *s, struct rfcomm_dlc *d) { BT_DBG("dlc %p session %p", d, s); rfcomm_session_clear_timer(s); rfcomm_dlc_hold(d); list_add(&d->list, &s->dlcs); d->session = s; } static void rfcomm_dlc_unlink(struct rfcomm_dlc *d) { struct rfcomm_session *s = d->session; BT_DBG("dlc %p refcnt %d session %p", d, refcount_read(&d->refcnt), s); list_del(&d->list); d->session = NULL; rfcomm_dlc_put(d); if (list_empty(&s->dlcs)) rfcomm_session_set_timer(s, RFCOMM_IDLE_TIMEOUT); } static struct rfcomm_dlc *rfcomm_dlc_get(struct rfcomm_session *s, u8 dlci) { struct rfcomm_dlc *d; list_for_each_entry(d, &s->dlcs, list) if (d->dlci == dlci) return d; return NULL; } static int rfcomm_check_channel(u8 channel) { return channel < 1 || channel > 30; } static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel) { struct rfcomm_session *s; int err = 0; u8 dlci; BT_DBG("dlc %p state %ld %pMR -> %pMR channel %d", d, d->state, src, dst, channel); if (rfcomm_check_channel(channel)) return -EINVAL; if (d->state != BT_OPEN && d->state != BT_CLOSED) return 0; s = rfcomm_session_get(src, dst); if (!s) { s = rfcomm_session_create(src, dst, d->sec_level, &err); if (!s) return err; } dlci = __dlci(__session_dir(s), channel); /* Check if DLCI already exists */ if (rfcomm_dlc_get(s, dlci)) return -EBUSY; rfcomm_dlc_clear_state(d); d->dlci = dlci; d->addr = __addr(s->initiator, dlci); d->priority = 7; d->state = BT_CONFIG; rfcomm_dlc_link(s, d); d->out = 1; d->mtu = s->mtu; d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc; if (s->state == BT_CONNECTED) { if (rfcomm_check_security(d)) rfcomm_send_pn(s, 1, d); else set_bit(RFCOMM_AUTH_PENDING, &d->flags); } rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); return 0; } int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel) { int r; rfcomm_lock(); r = __rfcomm_dlc_open(d, src, dst, channel); rfcomm_unlock(); return r; } static void __rfcomm_dlc_disconn(struct rfcomm_dlc *d) { struct rfcomm_session *s = d->session; d->state = BT_DISCONN; if (skb_queue_empty(&d->tx_queue)) { rfcomm_send_disc(s, d->dlci); rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT); } else { rfcomm_queue_disc(d); rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT * 2); } } static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) { struct rfcomm_session *s = d->session; if (!s) return 0; BT_DBG("dlc %p state %ld dlci %d err %d session %p", d, d->state, d->dlci, err, s); switch (d->state) { case BT_CONNECT: case BT_CONFIG: case BT_OPEN: case BT_CONNECT2: if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { set_bit(RFCOMM_AUTH_REJECT, &d->flags); rfcomm_schedule(); return 0; } } switch (d->state) { case BT_CONNECT: case BT_CONNECTED: __rfcomm_dlc_disconn(d); break; case BT_CONFIG: if (s->state != BT_BOUND) { __rfcomm_dlc_disconn(d); break; } /* if closing a dlc in a session that hasn't been started, * just close and unlink the dlc */ fallthrough; default: rfcomm_dlc_clear_timer(d); rfcomm_dlc_lock(d); d->state = BT_CLOSED; d->state_change(d, err); rfcomm_dlc_unlock(d); skb_queue_purge(&d->tx_queue); rfcomm_dlc_unlink(d); } return 0; } int rfcomm_dlc_close(struct rfcomm_dlc *d, int err) { int r = 0; struct rfcomm_dlc *d_list; struct rfcomm_session *s, *s_list; BT_DBG("dlc %p state %ld dlci %d err %d", d, d->state, d->dlci, err); rfcomm_lock(); s = d->session; if (!s) goto no_session; /* after waiting on the mutex check the session still exists * then check the dlc still exists */ list_for_each_entry(s_list, &session_list, list) { if (s_list == s) { list_for_each_entry(d_list, &s->dlcs, list) { if (d_list == d) { r = __rfcomm_dlc_close(d, err); break; } } break; } } no_session: rfcomm_unlock(); return r; } struct rfcomm_dlc *rfcomm_dlc_exists(bdaddr_t *src, bdaddr_t *dst, u8 channel) { struct rfcomm_session *s; struct rfcomm_dlc *dlc = NULL; u8 dlci; if (rfcomm_check_channel(channel)) return ERR_PTR(-EINVAL); rfcomm_lock(); s = rfcomm_session_get(src, dst); if (s) { dlci = __dlci(__session_dir(s), channel); dlc = rfcomm_dlc_get(s, dlci); } rfcomm_unlock(); return dlc; } static int rfcomm_dlc_send_frag(struct rfcomm_dlc *d, struct sk_buff *frag) { int len = frag->len; BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len); if (len > d->mtu) return -EINVAL; rfcomm_make_uih(frag, d->addr); __skb_queue_tail(&d->tx_queue, frag); return len; } int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) { unsigned long flags; struct sk_buff *frag, *next; int len; if (d->state != BT_CONNECTED) return -ENOTCONN; frag = skb_shinfo(skb)->frag_list; skb_shinfo(skb)->frag_list = NULL; /* Queue all fragments atomically. */ spin_lock_irqsave(&d->tx_queue.lock, flags); len = rfcomm_dlc_send_frag(d, skb); if (len < 0 || !frag) goto unlock; for (; frag; frag = next) { int ret; next = frag->next; ret = rfcomm_dlc_send_frag(d, frag); if (ret < 0) { dev_kfree_skb_irq(frag); goto unlock; } len += ret; } unlock: spin_unlock_irqrestore(&d->tx_queue.lock, flags); if (len > 0 && !test_bit(RFCOMM_TX_THROTTLED, &d->flags)) rfcomm_schedule(); return len; } void rfcomm_dlc_send_noerror(struct rfcomm_dlc *d, struct sk_buff *skb) { int len = skb->len; BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len); rfcomm_make_uih(skb, d->addr); skb_queue_tail(&d->tx_queue, skb); if (d->state == BT_CONNECTED && !test_bit(RFCOMM_TX_THROTTLED, &d->flags)) rfcomm_schedule(); } void __rfcomm_dlc_throttle(struct rfcomm_dlc *d) { BT_DBG("dlc %p state %ld", d, d->state); if (!d->cfc) { d->v24_sig |= RFCOMM_V24_FC; set_bit(RFCOMM_MSC_PENDING, &d->flags); } rfcomm_schedule(); } void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d) { BT_DBG("dlc %p state %ld", d, d->state); if (!d->cfc) { d->v24_sig &= ~RFCOMM_V24_FC; set_bit(RFCOMM_MSC_PENDING, &d->flags); } rfcomm_schedule(); } /* Set/get modem status functions use _local_ status i.e. what we report to the other side. Remote status is provided by dlc->modem_status() callback. */ int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig) { BT_DBG("dlc %p state %ld v24_sig 0x%x", d, d->state, v24_sig); if (test_bit(RFCOMM_RX_THROTTLED, &d->flags)) v24_sig |= RFCOMM_V24_FC; else v24_sig &= ~RFCOMM_V24_FC; d->v24_sig = v24_sig; if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags)) rfcomm_schedule(); return 0; } int rfcomm_dlc_get_modem_status(struct rfcomm_dlc *d, u8 *v24_sig) { BT_DBG("dlc %p state %ld v24_sig 0x%x", d, d->state, d->v24_sig); *v24_sig = d->v24_sig; return 0; } /* ---- RFCOMM sessions ---- */ static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state) { struct rfcomm_session *s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) return NULL; BT_DBG("session %p sock %p", s, sock); timer_setup(&s->timer, rfcomm_session_timeout, 0); INIT_LIST_HEAD(&s->dlcs); s->state = state; s->sock = sock; s->mtu = RFCOMM_DEFAULT_MTU; s->cfc = disable_cfc ? RFCOMM_CFC_DISABLED : RFCOMM_CFC_UNKNOWN; /* Do not increment module usage count for listening sessions. * Otherwise we won't be able to unload the module. */ if (state != BT_LISTEN) if (!try_module_get(THIS_MODULE)) { kfree(s); return NULL; } list_add(&s->list, &session_list); return s; } static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s) { int state = s->state; BT_DBG("session %p state %ld", s, s->state); list_del(&s->list); rfcomm_session_clear_timer(s); sock_release(s->sock); kfree(s); if (state != BT_LISTEN) module_put(THIS_MODULE); return NULL; } static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst) { struct rfcomm_session *s, *n; struct l2cap_chan *chan; list_for_each_entry_safe(s, n, &session_list, list) { chan = l2cap_pi(s->sock->sk)->chan; if ((!bacmp(src, BDADDR_ANY) || !bacmp(&chan->src, src)) && !bacmp(&chan->dst, dst)) return s; } return NULL; } static struct rfcomm_session *rfcomm_session_close(struct rfcomm_session *s, int err) { struct rfcomm_dlc *d, *n; s->state = BT_CLOSED; BT_DBG("session %p state %ld err %d", s, s->state, err); /* Close all dlcs */ list_for_each_entry_safe(d, n, &s->dlcs, list) { d->state = BT_CLOSED; __rfcomm_dlc_close(d, err); } rfcomm_session_clear_timer(s); return rfcomm_session_del(s); } static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst, u8 sec_level, int *err) { struct rfcomm_session *s = NULL; struct sockaddr_l2 addr; struct socket *sock; struct sock *sk; BT_DBG("%pMR -> %pMR", src, dst); *err = rfcomm_l2sock_create(&sock); if (*err < 0) return NULL; bacpy(&addr.l2_bdaddr, src); addr.l2_family = AF_BLUETOOTH; addr.l2_psm = 0; addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); if (*err < 0) goto failed; /* Set L2CAP options */ sk = sock->sk; lock_sock(sk); /* Set MTU to 0 so L2CAP can auto select the MTU */ l2cap_pi(sk)->chan->imtu = 0; l2cap_pi(sk)->chan->sec_level = sec_level; if (l2cap_ertm) l2cap_pi(sk)->chan->mode = L2CAP_MODE_ERTM; release_sock(sk); s = rfcomm_session_add(sock, BT_BOUND); if (!s) { *err = -ENOMEM; goto failed; } s->initiator = 1; bacpy(&addr.l2_bdaddr, dst); addr.l2_family = AF_BLUETOOTH; addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM); addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK); if (*err == 0 || *err == -EINPROGRESS) return s; return rfcomm_session_del(s); failed: sock_release(sock); return NULL; } void rfcomm_session_getaddr(struct rfcomm_session *s, bdaddr_t *src, bdaddr_t *dst) { struct l2cap_chan *chan = l2cap_pi(s->sock->sk)->chan; if (src) bacpy(src, &chan->src); if (dst) bacpy(dst, &chan->dst); } /* ---- RFCOMM frame sending ---- */ static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len) { struct kvec iv = { data, len }; struct msghdr msg; BT_DBG("session %p len %d", s, len); memset(&msg, 0, sizeof(msg)); return kernel_sendmsg(s->sock, &msg, &iv, 1, len); } static int rfcomm_send_cmd(struct rfcomm_session *s, struct rfcomm_cmd *cmd) { BT_DBG("%p cmd %u", s, cmd->ctrl); return rfcomm_send_frame(s, (void *) cmd, sizeof(*cmd)); } static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci) { struct rfcomm_cmd cmd; BT_DBG("%p dlci %d", s, dlci); cmd.addr = __addr(s->initiator, dlci); cmd.ctrl = __ctrl(RFCOMM_SABM, 1); cmd.len = __len8(0); cmd.fcs = __fcs2((u8 *) &cmd); return rfcomm_send_cmd(s, &cmd); } static int rfcomm_send_ua(struct rfcomm_session *s, u8 dlci) { struct rfcomm_cmd cmd; BT_DBG("%p dlci %d", s, dlci); cmd.addr = __addr(!s->initiator, dlci); cmd.ctrl = __ctrl(RFCOMM_UA, 1); cmd.len = __len8(0); cmd.fcs = __fcs2((u8 *) &cmd); return rfcomm_send_cmd(s, &cmd); } static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci) { struct rfcomm_cmd cmd; BT_DBG("%p dlci %d", s, dlci); cmd.addr = __addr(s->initiator, dlci); cmd.ctrl = __ctrl(RFCOMM_DISC, 1); cmd.len = __len8(0); cmd.fcs = __fcs2((u8 *) &cmd); return rfcomm_send_cmd(s, &cmd); } static int rfcomm_queue_disc(struct rfcomm_dlc *d) { struct rfcomm_cmd *cmd; struct sk_buff *skb; BT_DBG("dlc %p dlci %d", d, d->dlci); skb = alloc_skb(sizeof(*cmd), GFP_KERNEL); if (!skb) return -ENOMEM; cmd = __skb_put(skb, sizeof(*cmd)); cmd->addr = d->addr; cmd->ctrl = __ctrl(RFCOMM_DISC, 1); cmd->len = __len8(0); cmd->fcs = __fcs2((u8 *) cmd); skb_queue_tail(&d->tx_queue, skb); rfcomm_schedule(); return 0; } static int rfcomm_send_dm(struct rfcomm_session *s, u8 dlci) { struct rfcomm_cmd cmd; BT_DBG("%p dlci %d", s, dlci); cmd.addr = __addr(!s->initiator, dlci); cmd.ctrl = __ctrl(RFCOMM_DM, 1); cmd.len = __len8(0); cmd.fcs = __fcs2((u8 *) &cmd); return rfcomm_send_cmd(s, &cmd); } static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type) { struct rfcomm_hdr *hdr; struct rfcomm_mcc *mcc; u8 buf[16], *ptr = buf; BT_DBG("%p cr %d type %d", s, cr, type); hdr = (void *) ptr; ptr += sizeof(*hdr); hdr->addr = __addr(s->initiator, 0); hdr->ctrl = __ctrl(RFCOMM_UIH, 0); hdr->len = __len8(sizeof(*mcc) + 1); mcc = (void *) ptr; ptr += sizeof(*mcc); mcc->type = __mcc_type(0, RFCOMM_NSC); mcc->len = __len8(1); /* Type that we didn't like */ *ptr = __mcc_type(cr, type); ptr++; *ptr = __fcs(buf); ptr++; return rfcomm_send_frame(s, buf, ptr - buf); } static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d) { struct rfcomm_hdr *hdr; struct rfcomm_mcc *mcc; struct rfcomm_pn *pn; u8 buf[16], *ptr = buf; BT_DBG("%p cr %d dlci %d mtu %d", s, cr, d->dlci, d->mtu); hdr = (void *) ptr; ptr += sizeof(*hdr); hdr->addr = __addr(s->initiator, 0); hdr->ctrl = __ctrl(RFCOMM_UIH, 0); hdr->len = __len8(sizeof(*mcc) + sizeof(*pn)); mcc = (void *) ptr; ptr += sizeof(*mcc); mcc->type = __mcc_type(cr, RFCOMM_PN); mcc->len = __len8(sizeof(*pn)); pn = (void *) ptr; ptr += sizeof(*pn); pn->dlci = d->dlci; pn->priority = d->priority; pn->ack_timer = 0; pn->max_retrans = 0; if (s->cfc) { pn->flow_ctrl = cr ? 0xf0 : 0xe0; pn->credits = RFCOMM_DEFAULT_CREDITS; } else { pn->flow_ctrl = 0; pn->credits = 0; } if (cr && channel_mtu >= 0) pn->mtu = cpu_to_le16(channel_mtu); else pn->mtu = cpu_to_le16(d->mtu); *ptr = __fcs(buf); ptr++; return rfcomm_send_frame(s, buf, ptr - buf); } int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, u8 bit_rate, u8 data_bits, u8 stop_bits, u8 parity, u8 flow_ctrl_settings, u8 xon_char, u8 xoff_char, u16 param_mask) { struct rfcomm_hdr *hdr; struct rfcomm_mcc *mcc; struct rfcomm_rpn *rpn; u8 buf[16], *ptr = buf; BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x" " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", s, cr, dlci, bit_rate, data_bits, stop_bits, parity, flow_ctrl_settings, xon_char, xoff_char, param_mask); hdr = (void *) ptr; ptr += sizeof(*hdr); hdr->addr = __addr(s->initiator, 0); hdr->ctrl = __ctrl(RFCOMM_UIH, 0); hdr->len = __len8(sizeof(*mcc) + sizeof(*rpn)); mcc = (void *) ptr; ptr += sizeof(*mcc); mcc->type = __mcc_type(cr, RFCOMM_RPN); mcc->len = __len8(sizeof(*rpn)); rpn = (void *) ptr; ptr += sizeof(*rpn); rpn->dlci = __addr(1, dlci); rpn->bit_rate = bit_rate; rpn->line_settings = __rpn_line_settings(data_bits, stop_bits, parity); rpn->flow_ctrl = flow_ctrl_settings; rpn->xon_char = xon_char; rpn->xoff_char = xoff_char; rpn->param_mask = cpu_to_le16(param_mask); *ptr = __fcs(buf); ptr++; return rfcomm_send_frame(s, buf, ptr - buf); } static int rfcomm_send_rls(struct rfcomm_session *s, int cr, u8 dlci, u8 status) { struct rfcomm_hdr *hdr; struct rfcomm_mcc *mcc; struct rfcomm_rls *rls; u8 buf[16], *ptr = buf; BT_DBG("%p cr %d status 0x%x", s, cr, status); hdr = (void *) ptr; ptr += sizeof(*hdr); hdr->addr = __addr(s->initiator, 0); hdr->ctrl = __ctrl(RFCOMM_UIH, 0); hdr->len = __len8(sizeof(*mcc) + sizeof(*rls)); mcc = (void *) ptr; ptr += sizeof(*mcc); mcc->type = __mcc_type(cr, RFCOMM_RLS); mcc->len = __len8(sizeof(*rls)); rls = (void *) ptr; ptr += sizeof(*rls); rls->dlci = __addr(1, dlci); rls->status = status; *ptr = __fcs(buf); ptr++; return rfcomm_send_frame(s, buf, ptr - buf); } static int rfcomm_send_msc(struct rfcomm_session *s, int cr, u8 dlci, u8 v24_sig) { struct rfcomm_hdr *hdr; struct rfcomm_mcc *mcc; struct rfcomm_msc *msc; u8 buf[16], *ptr = buf; BT_DBG("%p cr %d v24 0x%x", s, cr, v24_sig); hdr = (void *) ptr; ptr += sizeof(*hdr); hdr->addr = __addr(s->initiator, 0); hdr->ctrl = __ctrl(RFCOMM_UIH, 0); hdr->len = __len8(sizeof(*mcc) + sizeof(*msc)); mcc = (void *) ptr; ptr += sizeof(*mcc); mcc->type = __mcc_type(cr, RFCOMM_MSC); mcc->len = __len8(sizeof(*msc)); msc = (void *) ptr; ptr += sizeof(*msc); msc->dlci = __addr(1, dlci); msc->v24_sig = v24_sig | 0x01; *ptr = __fcs(buf); ptr++; return rfcomm_send_frame(s, buf, ptr - buf); } static int rfcomm_send_fcoff(struct rfcomm_session *s, int cr) { struct rfcomm_hdr *hdr; struct rfcomm_mcc *mcc; u8 buf[16], *ptr = buf; BT_DBG("%p cr %d", s, cr); hdr = (void *) ptr; ptr += sizeof(*hdr); hdr->addr = __addr(s->initiator, 0); hdr->ctrl = __ctrl(RFCOMM_UIH, 0); hdr->len = __len8(sizeof(*mcc)); mcc = (void *) ptr; ptr += sizeof(*mcc); mcc->type = __mcc_type(cr, RFCOMM_FCOFF); mcc->len = __len8(0); *ptr = __fcs(buf); ptr++; return rfcomm_send_frame(s, buf, ptr - buf); } static int rfcomm_send_fcon(struct rfcomm_session *s, int cr) { struct rfcomm_hdr *hdr; struct rfcomm_mcc *mcc; u8 buf[16], *ptr = buf; BT_DBG("%p cr %d", s, cr); hdr = (void *) ptr; ptr += sizeof(*hdr); hdr->addr = __addr(s->initiator, 0); hdr->ctrl = __ctrl(RFCOMM_UIH, 0); hdr->len = __len8(sizeof(*mcc)); mcc = (void *) ptr; ptr += sizeof(*mcc); mcc->type = __mcc_type(cr, RFCOMM_FCON); mcc->len = __len8(0); *ptr = __fcs(buf); ptr++; return rfcomm_send_frame(s, buf, ptr - buf); } static int rfcomm_send_test(struct rfcomm_session *s, int cr, u8 *pattern, int len) { struct socket *sock = s->sock; struct kvec iv[3]; struct msghdr msg; unsigned char hdr[5], crc[1]; if (len > 125) return -EINVAL; BT_DBG("%p cr %d", s, cr); hdr[0] = __addr(s->initiator, 0); hdr[1] = __ctrl(RFCOMM_UIH, 0); hdr[2] = 0x01 | ((len + 2) << 1); hdr[3] = 0x01 | ((cr & 0x01) << 1) | (RFCOMM_TEST << 2); hdr[4] = 0x01 | (len << 1); crc[0] = __fcs(hdr); iv[0].iov_base = hdr; iv[0].iov_len = 5; iv[1].iov_base = pattern; iv[1].iov_len = len; iv[2].iov_base = crc; iv[2].iov_len = 1; memset(&msg, 0, sizeof(msg)); return kernel_sendmsg(sock, &msg, iv, 3, 6 + len); } static int rfcomm_send_credits(struct rfcomm_session *s, u8 addr, u8 credits) { struct rfcomm_hdr *hdr; u8 buf[16], *ptr = buf; BT_DBG("%p addr %d credits %d", s, addr, credits); hdr = (void *) ptr; ptr += sizeof(*hdr); hdr->addr = addr; hdr->ctrl = __ctrl(RFCOMM_UIH, 1); hdr->len = __len8(0); *ptr = credits; ptr++; *ptr = __fcs(buf); ptr++; return rfcomm_send_frame(s, buf, ptr - buf); } static void rfcomm_make_uih(struct sk_buff *skb, u8 addr) { struct rfcomm_hdr *hdr; int len = skb->len; u8 *crc; if (len > 127) { hdr = skb_push(skb, 4); put_unaligned(cpu_to_le16(__len16(len)), (__le16 *) &hdr->len); } else { hdr = skb_push(skb, 3); hdr->len = __len8(len); } hdr->addr = addr; hdr->ctrl = __ctrl(RFCOMM_UIH, 0); crc = skb_put(skb, 1); *crc = __fcs((void *) hdr); } /* ---- RFCOMM frame reception ---- */ static struct rfcomm_session *rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci) { BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); if (dlci) { /* Data channel */ struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci); if (!d) { rfcomm_send_dm(s, dlci); return s; } switch (d->state) { case BT_CONNECT: rfcomm_dlc_clear_timer(d); rfcomm_dlc_lock(d); d->state = BT_CONNECTED; d->state_change(d, 0); rfcomm_dlc_unlock(d); rfcomm_send_msc(s, 1, dlci, d->v24_sig); break; case BT_DISCONN: d->state = BT_CLOSED; __rfcomm_dlc_close(d, 0); if (list_empty(&s->dlcs)) { s->state = BT_DISCONN; rfcomm_send_disc(s, 0); rfcomm_session_clear_timer(s); } break; } } else { /* Control channel */ switch (s->state) { case BT_CONNECT: s->state = BT_CONNECTED; rfcomm_process_connect(s); break; case BT_DISCONN: s = rfcomm_session_close(s, ECONNRESET); break; } } return s; } static struct rfcomm_session *rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci) { int err = 0; BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); if (dlci) { /* Data DLC */ struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci); if (d) { if (d->state == BT_CONNECT || d->state == BT_CONFIG) err = ECONNREFUSED; else err = ECONNRESET; d->state = BT_CLOSED; __rfcomm_dlc_close(d, err); } } else { if (s->state == BT_CONNECT) err = ECONNREFUSED; else err = ECONNRESET; s = rfcomm_session_close(s, err); } return s; } static struct rfcomm_session *rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci) { int err = 0; BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); if (dlci) { struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci); if (d) { rfcomm_send_ua(s, dlci); if (d->state == BT_CONNECT || d->state == BT_CONFIG) err = ECONNREFUSED; else err = ECONNRESET; d->state = BT_CLOSED; __rfcomm_dlc_close(d, err); } else rfcomm_send_dm(s, dlci); } else { rfcomm_send_ua(s, 0); if (s->state == BT_CONNECT) err = ECONNREFUSED; else err = ECONNRESET; s = rfcomm_session_close(s, err); } return s; } void rfcomm_dlc_accept(struct rfcomm_dlc *d) { struct sock *sk = d->session->sock->sk; struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn; BT_DBG("dlc %p", d); rfcomm_send_ua(d->session, d->dlci); rfcomm_dlc_clear_timer(d); rfcomm_dlc_lock(d); d->state = BT_CONNECTED; d->state_change(d, 0); rfcomm_dlc_unlock(d); if (d->role_switch) hci_conn_switch_role(conn->hcon, 0x00); rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig); } static void rfcomm_check_accept(struct rfcomm_dlc *d) { if (rfcomm_check_security(d)) { if (d->defer_setup) { set_bit(RFCOMM_DEFER_SETUP, &d->flags); rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); rfcomm_dlc_lock(d); d->state = BT_CONNECT2; d->state_change(d, 0); rfcomm_dlc_unlock(d); } else rfcomm_dlc_accept(d); } else { set_bit(RFCOMM_AUTH_PENDING, &d->flags); rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); } } static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci) { struct rfcomm_dlc *d; u8 channel; BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); if (!dlci) { rfcomm_send_ua(s, 0); if (s->state == BT_OPEN) { s->state = BT_CONNECTED; rfcomm_process_connect(s); } return 0; } /* Check if DLC exists */ d = rfcomm_dlc_get(s, dlci); if (d) { if (d->state == BT_OPEN) { /* DLC was previously opened by PN request */ rfcomm_check_accept(d); } return 0; } /* Notify socket layer about incoming connection */ channel = __srv_channel(dlci); if (rfcomm_connect_ind(s, channel, &d)) { d->dlci = dlci; d->addr = __addr(s->initiator, dlci); rfcomm_dlc_link(s, d); rfcomm_check_accept(d); } else { rfcomm_send_dm(s, dlci); } return 0; } static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn) { struct rfcomm_session *s = d->session; BT_DBG("dlc %p state %ld dlci %d mtu %d fc 0x%x credits %d", d, d->state, d->dlci, pn->mtu, pn->flow_ctrl, pn->credits); if ((pn->flow_ctrl == 0xf0 && s->cfc != RFCOMM_CFC_DISABLED) || pn->flow_ctrl == 0xe0) { d->cfc = RFCOMM_CFC_ENABLED; d->tx_credits = pn->credits; } else { d->cfc = RFCOMM_CFC_DISABLED; set_bit(RFCOMM_TX_THROTTLED, &d->flags); } if (s->cfc == RFCOMM_CFC_UNKNOWN) s->cfc = d->cfc; d->priority = pn->priority; d->mtu = __le16_to_cpu(pn->mtu); if (cr && d->mtu > s->mtu) d->mtu = s->mtu; return 0; } static int rfcomm_recv_pn(struct rfcomm_session *s, int cr, struct sk_buff *skb) { struct rfcomm_pn *pn = (void *) skb->data; struct rfcomm_dlc *d; u8 dlci = pn->dlci; BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); if (!dlci) return 0; d = rfcomm_dlc_get(s, dlci); if (d) { if (cr) { /* PN request */ rfcomm_apply_pn(d, cr, pn); rfcomm_send_pn(s, 0, d); } else { /* PN response */ switch (d->state) { case BT_CONFIG: rfcomm_apply_pn(d, cr, pn); d->state = BT_CONNECT; rfcomm_send_sabm(s, d->dlci); break; } } } else { u8 channel = __srv_channel(dlci); if (!cr) return 0; /* PN request for non existing DLC. * Assume incoming connection. */ if (rfcomm_connect_ind(s, channel, &d)) { d->dlci = dlci; d->addr = __addr(s->initiator, dlci); rfcomm_dlc_link(s, d); rfcomm_apply_pn(d, cr, pn); d->state = BT_OPEN; rfcomm_send_pn(s, 0, d); } else { rfcomm_send_dm(s, dlci); } } return 0; } static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_buff *skb) { struct rfcomm_rpn *rpn = (void *) skb->data; u8 dlci = __get_dlci(rpn->dlci); u8 bit_rate = 0; u8 data_bits = 0; u8 stop_bits = 0; u8 parity = 0; u8 flow_ctrl = 0; u8 xon_char = 0; u8 xoff_char = 0; u16 rpn_mask = RFCOMM_RPN_PM_ALL; BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, rpn->xon_char, rpn->xoff_char, rpn->param_mask); if (!cr) return 0; if (len == 1) { /* This is a request, return default (according to ETSI TS 07.10) settings */ bit_rate = RFCOMM_RPN_BR_9600; data_bits = RFCOMM_RPN_DATA_8; stop_bits = RFCOMM_RPN_STOP_1; parity = RFCOMM_RPN_PARITY_NONE; flow_ctrl = RFCOMM_RPN_FLOW_NONE; xon_char = RFCOMM_RPN_XON_CHAR; xoff_char = RFCOMM_RPN_XOFF_CHAR; goto rpn_out; } /* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit, * no parity, no flow control lines, normal XON/XOFF chars */ if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_BITRATE)) { bit_rate = rpn->bit_rate; if (bit_rate > RFCOMM_RPN_BR_230400) { BT_DBG("RPN bit rate mismatch 0x%x", bit_rate); bit_rate = RFCOMM_RPN_BR_9600; rpn_mask ^= RFCOMM_RPN_PM_BITRATE; } } if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_DATA)) { data_bits = __get_rpn_data_bits(rpn->line_settings); if (data_bits != RFCOMM_RPN_DATA_8) { BT_DBG("RPN data bits mismatch 0x%x", data_bits); data_bits = RFCOMM_RPN_DATA_8; rpn_mask ^= RFCOMM_RPN_PM_DATA; } } if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_STOP)) { stop_bits = __get_rpn_stop_bits(rpn->line_settings); if (stop_bits != RFCOMM_RPN_STOP_1) { BT_DBG("RPN stop bits mismatch 0x%x", stop_bits); stop_bits = RFCOMM_RPN_STOP_1; rpn_mask ^= RFCOMM_RPN_PM_STOP; } } if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_PARITY)) { parity = __get_rpn_parity(rpn->line_settings); if (parity != RFCOMM_RPN_PARITY_NONE) { BT_DBG("RPN parity mismatch 0x%x", parity); parity = RFCOMM_RPN_PARITY_NONE; rpn_mask ^= RFCOMM_RPN_PM_PARITY; } } if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_FLOW)) { flow_ctrl = rpn->flow_ctrl; if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) { BT_DBG("RPN flow ctrl mismatch 0x%x", flow_ctrl); flow_ctrl = RFCOMM_RPN_FLOW_NONE; rpn_mask ^= RFCOMM_RPN_PM_FLOW; } } if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_XON)) { xon_char = rpn->xon_char; if (xon_char != RFCOMM_RPN_XON_CHAR) { BT_DBG("RPN XON char mismatch 0x%x", xon_char); xon_char = RFCOMM_RPN_XON_CHAR; rpn_mask ^= RFCOMM_RPN_PM_XON; } } if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_XOFF)) { xoff_char = rpn->xoff_char; if (xoff_char != RFCOMM_RPN_XOFF_CHAR) { BT_DBG("RPN XOFF char mismatch 0x%x", xoff_char); xoff_char = RFCOMM_RPN_XOFF_CHAR; rpn_mask ^= RFCOMM_RPN_PM_XOFF; } } rpn_out: rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits, parity, flow_ctrl, xon_char, xoff_char, rpn_mask); return 0; } static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb) { struct rfcomm_rls *rls = (void *) skb->data; u8 dlci = __get_dlci(rls->dlci); BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status); if (!cr) return 0; /* We should probably do something with this information here. But * for now it's sufficient just to reply -- Bluetooth 1.1 says it's * mandatory to recognise and respond to RLS */ rfcomm_send_rls(s, 0, dlci, rls->status); return 0; } static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb) { struct rfcomm_msc *msc = (void *) skb->data; struct rfcomm_dlc *d; u8 dlci = __get_dlci(msc->dlci); BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig); d = rfcomm_dlc_get(s, dlci); if (!d) return 0; if (cr) { if (msc->v24_sig & RFCOMM_V24_FC && !d->cfc) set_bit(RFCOMM_TX_THROTTLED, &d->flags); else clear_bit(RFCOMM_TX_THROTTLED, &d->flags); rfcomm_dlc_lock(d); d->remote_v24_sig = msc->v24_sig; if (d->modem_status) d->modem_status(d, msc->v24_sig); rfcomm_dlc_unlock(d); rfcomm_send_msc(s, 0, dlci, msc->v24_sig); d->mscex |= RFCOMM_MSCEX_RX; } else d->mscex |= RFCOMM_MSCEX_TX; return 0; } static int rfcomm_recv_mcc(struct rfcomm_session *s, struct sk_buff *skb) { struct rfcomm_mcc *mcc = (void *) skb->data; u8 type, cr, len; cr = __test_cr(mcc->type); type = __get_mcc_type(mcc->type); len = __get_mcc_len(mcc->len); BT_DBG("%p type 0x%x cr %d", s, type, cr); skb_pull(skb, 2); switch (type) { case RFCOMM_PN: rfcomm_recv_pn(s, cr, skb); break; case RFCOMM_RPN: rfcomm_recv_rpn(s, cr, len, skb); break; case RFCOMM_RLS: rfcomm_recv_rls(s, cr, skb); break; case RFCOMM_MSC: rfcomm_recv_msc(s, cr, skb); break; case RFCOMM_FCOFF: if (cr) { set_bit(RFCOMM_TX_THROTTLED, &s->flags); rfcomm_send_fcoff(s, 0); } break; case RFCOMM_FCON: if (cr) { clear_bit(RFCOMM_TX_THROTTLED, &s->flags); rfcomm_send_fcon(s, 0); } break; case RFCOMM_TEST: if (cr) rfcomm_send_test(s, 0, skb->data, skb->len); break; case RFCOMM_NSC: break; default: BT_ERR("Unknown control type 0x%02x", type); rfcomm_send_nsc(s, cr, type); break; } return 0; } static int rfcomm_recv_data(struct rfcomm_session *s, u8 dlci, int pf, struct sk_buff *skb) { struct rfcomm_dlc *d; BT_DBG("session %p state %ld dlci %d pf %d", s, s->state, dlci, pf); d = rfcomm_dlc_get(s, dlci); if (!d) { rfcomm_send_dm(s, dlci); goto drop; } if (pf && d->cfc) { u8 credits = *(u8 *) skb->data; skb_pull(skb, 1); d->tx_credits += credits; if (d->tx_credits) clear_bit(RFCOMM_TX_THROTTLED, &d->flags); } if (skb->len && d->state == BT_CONNECTED) { rfcomm_dlc_lock(d); d->rx_credits--; d->data_ready(d, skb); rfcomm_dlc_unlock(d); return 0; } drop: kfree_skb(skb); return 0; } static struct rfcomm_session *rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb) { struct rfcomm_hdr *hdr = (void *) skb->data; u8 type, dlci, fcs; if (!s) { /* no session, so free socket data */ kfree_skb(skb); return s; } dlci = __get_dlci(hdr->addr); type = __get_type(hdr->ctrl); /* Trim FCS */ skb->len--; skb->tail--; fcs = *(u8 *)skb_tail_pointer(skb); if (__check_fcs(skb->data, type, fcs)) { BT_ERR("bad checksum in packet"); kfree_skb(skb); return s; } if (__test_ea(hdr->len)) skb_pull(skb, 3); else skb_pull(skb, 4); switch (type) { case RFCOMM_SABM: if (__test_pf(hdr->ctrl)) rfcomm_recv_sabm(s, dlci); break; case RFCOMM_DISC: if (__test_pf(hdr->ctrl)) s = rfcomm_recv_disc(s, dlci); break; case RFCOMM_UA: if (__test_pf(hdr->ctrl)) s = rfcomm_recv_ua(s, dlci); break; case RFCOMM_DM: s = rfcomm_recv_dm(s, dlci); break; case RFCOMM_UIH: if (dlci) { rfcomm_recv_data(s, dlci, __test_pf(hdr->ctrl), skb); return s; } rfcomm_recv_mcc(s, skb); break; default: BT_ERR("Unknown packet type 0x%02x", type); break; } kfree_skb(skb); return s; } /* ---- Connection and data processing ---- */ static void rfcomm_process_connect(struct rfcomm_session *s) { struct rfcomm_dlc *d, *n; BT_DBG("session %p state %ld", s, s->state); list_for_each_entry_safe(d, n, &s->dlcs, list) { if (d->state == BT_CONFIG) { d->mtu = s->mtu; if (rfcomm_check_security(d)) { rfcomm_send_pn(s, 1, d); } else { set_bit(RFCOMM_AUTH_PENDING, &d->flags); rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); } } } } /* Send data queued for the DLC. * Return number of frames left in the queue. */ static int rfcomm_process_tx(struct rfcomm_dlc *d) { struct sk_buff *skb; int err; BT_DBG("dlc %p state %ld cfc %d rx_credits %d tx_credits %d", d, d->state, d->cfc, d->rx_credits, d->tx_credits); /* Send pending MSC */ if (test_and_clear_bit(RFCOMM_MSC_PENDING, &d->flags)) rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig); if (d->cfc) { /* CFC enabled. * Give them some credits */ if (!test_bit(RFCOMM_RX_THROTTLED, &d->flags) && d->rx_credits <= (d->cfc >> 2)) { rfcomm_send_credits(d->session, d->addr, d->cfc - d->rx_credits); d->rx_credits = d->cfc; } } else { /* CFC disabled. * Give ourselves some credits */ d->tx_credits = 5; } if (test_bit(RFCOMM_TX_THROTTLED, &d->flags)) return skb_queue_len(&d->tx_queue); while (d->tx_credits && (skb = skb_dequeue(&d->tx_queue))) { err = rfcomm_send_frame(d->session, skb->data, skb->len); if (err < 0) { skb_queue_head(&d->tx_queue, skb); break; } kfree_skb(skb); d->tx_credits--; } if (d->cfc && !d->tx_credits) { /* We're out of TX credits. * Set TX_THROTTLED flag to avoid unnesary wakeups by dlc_send. */ set_bit(RFCOMM_TX_THROTTLED, &d->flags); } return skb_queue_len(&d->tx_queue); } static void rfcomm_process_dlcs(struct rfcomm_session *s) { struct rfcomm_dlc *d, *n; BT_DBG("session %p state %ld", s, s->state); list_for_each_entry_safe(d, n, &s->dlcs, list) { if (test_bit(RFCOMM_TIMED_OUT, &d->flags)) { __rfcomm_dlc_close(d, ETIMEDOUT); continue; } if (test_bit(RFCOMM_ENC_DROP, &d->flags)) { __rfcomm_dlc_close(d, ECONNREFUSED); continue; } if (test_and_clear_bit(RFCOMM_AUTH_ACCEPT, &d->flags)) { rfcomm_dlc_clear_timer(d); if (d->out) { rfcomm_send_pn(s, 1, d); rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); } else { if (d->defer_setup) { set_bit(RFCOMM_DEFER_SETUP, &d->flags); rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); rfcomm_dlc_lock(d); d->state = BT_CONNECT2; d->state_change(d, 0); rfcomm_dlc_unlock(d); } else rfcomm_dlc_accept(d); } continue; } else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) { rfcomm_dlc_clear_timer(d); if (!d->out) rfcomm_send_dm(s, d->dlci); else d->state = BT_CLOSED; __rfcomm_dlc_close(d, ECONNREFUSED); continue; } if (test_bit(RFCOMM_SEC_PENDING, &d->flags)) continue; if (test_bit(RFCOMM_TX_THROTTLED, &s->flags)) continue; if ((d->state == BT_CONNECTED || d->state == BT_DISCONN) && d->mscex == RFCOMM_MSCEX_OK) rfcomm_process_tx(d); } } static struct rfcomm_session *rfcomm_process_rx(struct rfcomm_session *s) { struct socket *sock = s->sock; struct sock *sk = sock->sk; struct sk_buff *skb; BT_DBG("session %p state %ld qlen %d", s, s->state, skb_queue_len(&sk->sk_receive_queue)); /* Get data directly from socket receive queue without copying it. */ while ((skb = skb_dequeue(&sk->sk_receive_queue))) { skb_orphan(skb); if (!skb_linearize(skb) && sk->sk_state != BT_CLOSED) { s = rfcomm_recv_frame(s, skb); if (!s) break; } else { kfree_skb(skb); } } if (s && (sk->sk_state == BT_CLOSED)) s = rfcomm_session_close(s, sk->sk_err); return s; } static void rfcomm_accept_connection(struct rfcomm_session *s) { struct socket *sock = s->sock, *nsock; int err; /* Fast check for a new connection. * Avoids unnesesary socket allocations. */ if (list_empty(&bt_sk(sock->sk)->accept_q)) return; BT_DBG("session %p", s); err = kernel_accept(sock, &nsock, O_NONBLOCK); if (err < 0) return; /* Set our callbacks */ nsock->sk->sk_data_ready = rfcomm_l2data_ready; nsock->sk->sk_state_change = rfcomm_l2state_change; s = rfcomm_session_add(nsock, BT_OPEN); if (s) { /* We should adjust MTU on incoming sessions. * L2CAP MTU minus UIH header and FCS. */ s->mtu = min(l2cap_pi(nsock->sk)->chan->omtu, l2cap_pi(nsock->sk)->chan->imtu) - 5; rfcomm_schedule(); } else sock_release(nsock); } static struct rfcomm_session *rfcomm_check_connection(struct rfcomm_session *s) { struct sock *sk = s->sock->sk; BT_DBG("%p state %ld", s, s->state); switch (sk->sk_state) { case BT_CONNECTED: s->state = BT_CONNECT; /* We can adjust MTU on outgoing sessions. * L2CAP MTU minus UIH header and FCS. */ s->mtu = min(l2cap_pi(sk)->chan->omtu, l2cap_pi(sk)->chan->imtu) - 5; rfcomm_send_sabm(s, 0); break; case BT_CLOSED: s = rfcomm_session_close(s, sk->sk_err); break; } return s; } static void rfcomm_process_sessions(void) { struct rfcomm_session *s, *n; rfcomm_lock(); list_for_each_entry_safe(s, n, &session_list, list) { if (test_and_clear_bit(RFCOMM_TIMED_OUT, &s->flags)) { s->state = BT_DISCONN; rfcomm_send_disc(s, 0); continue; } switch (s->state) { case BT_LISTEN: rfcomm_accept_connection(s); continue; case BT_BOUND: s = rfcomm_check_connection(s); break; default: s = rfcomm_process_rx(s); break; } if (s) rfcomm_process_dlcs(s); } rfcomm_unlock(); } static int rfcomm_add_listener(bdaddr_t *ba) { struct sockaddr_l2 addr; struct socket *sock; struct sock *sk; struct rfcomm_session *s; int err = 0; /* Create socket */ err = rfcomm_l2sock_create(&sock); if (err < 0) { BT_ERR("Create socket failed %d", err); return err; } /* Bind socket */ bacpy(&addr.l2_bdaddr, ba); addr.l2_family = AF_BLUETOOTH; addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM); addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); if (err < 0) { BT_ERR("Bind failed %d", err); goto failed; } /* Set L2CAP options */ sk = sock->sk; lock_sock(sk); /* Set MTU to 0 so L2CAP can auto select the MTU */ l2cap_pi(sk)->chan->imtu = 0; release_sock(sk); /* Start listening on the socket */ err = kernel_listen(sock, 10); if (err) { BT_ERR("Listen failed %d", err); goto failed; } /* Add listening session */ s = rfcomm_session_add(sock, BT_LISTEN); if (!s) { err = -ENOMEM; goto failed; } return 0; failed: sock_release(sock); return err; } static void rfcomm_kill_listener(void) { struct rfcomm_session *s, *n; BT_DBG(""); list_for_each_entry_safe(s, n, &session_list, list) rfcomm_session_del(s); } static int rfcomm_run(void *unused) { DEFINE_WAIT_FUNC(wait, woken_wake_function); BT_DBG(""); set_user_nice(current, -10); rfcomm_add_listener(BDADDR_ANY); add_wait_queue(&rfcomm_wq, &wait); while (!kthread_should_stop()) { /* Process stuff */ rfcomm_process_sessions(); wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&rfcomm_wq, &wait); rfcomm_kill_listener(); return 0; } static bool rfcomm_match(struct hci_conn *hcon) { return hcon->type == ACL_LINK; } static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) { struct rfcomm_session *s; struct rfcomm_dlc *d, *n; BT_DBG("conn %p status 0x%02x encrypt 0x%02x", conn, status, encrypt); s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst); if (!s) return; list_for_each_entry_safe(d, n, &s->dlcs, list) { if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) { rfcomm_dlc_clear_timer(d); if (status || encrypt == 0x00) { set_bit(RFCOMM_ENC_DROP, &d->flags); continue; } } if (d->state == BT_CONNECTED && !status && encrypt == 0x00) { if (d->sec_level == BT_SECURITY_MEDIUM) { set_bit(RFCOMM_SEC_PENDING, &d->flags); rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); continue; } else if (d->sec_level == BT_SECURITY_HIGH || d->sec_level == BT_SECURITY_FIPS) { set_bit(RFCOMM_ENC_DROP, &d->flags); continue; } } if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags)) continue; if (!status && hci_conn_check_secure(conn, d->sec_level)) set_bit(RFCOMM_AUTH_ACCEPT, &d->flags); else set_bit(RFCOMM_AUTH_REJECT, &d->flags); } rfcomm_schedule(); } static struct hci_cb rfcomm_cb = { .name = "RFCOMM", .match = rfcomm_match, .security_cfm = rfcomm_security_cfm }; static int rfcomm_dlc_debugfs_show(struct seq_file *f, void *x) { struct rfcomm_session *s; rfcomm_lock(); list_for_each_entry(s, &session_list, list) { struct l2cap_chan *chan = l2cap_pi(s->sock->sk)->chan; struct rfcomm_dlc *d; list_for_each_entry(d, &s->dlcs, list) { seq_printf(f, "%pMR %pMR %ld %d %d %d %d\n", &chan->src, &chan->dst, d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits); } } rfcomm_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(rfcomm_dlc_debugfs); static struct dentry *rfcomm_dlc_debugfs; /* ---- Initialization ---- */ static int __init rfcomm_init(void) { int err; hci_register_cb(&rfcomm_cb); rfcomm_thread = kthread_run(rfcomm_run, NULL, "krfcommd"); if (IS_ERR(rfcomm_thread)) { err = PTR_ERR(rfcomm_thread); goto unregister; } err = rfcomm_init_ttys(); if (err < 0) goto stop; err = rfcomm_init_sockets(); if (err < 0) goto cleanup; BT_INFO("RFCOMM ver %s", VERSION); if (IS_ERR_OR_NULL(bt_debugfs)) return 0; rfcomm_dlc_debugfs = debugfs_create_file("rfcomm_dlc", 0444, bt_debugfs, NULL, &rfcomm_dlc_debugfs_fops); return 0; cleanup: rfcomm_cleanup_ttys(); stop: kthread_stop(rfcomm_thread); unregister: hci_unregister_cb(&rfcomm_cb); return err; } static void __exit rfcomm_exit(void) { debugfs_remove(rfcomm_dlc_debugfs); hci_unregister_cb(&rfcomm_cb); kthread_stop(rfcomm_thread); rfcomm_cleanup_ttys(); rfcomm_cleanup_sockets(); } module_init(rfcomm_init); module_exit(rfcomm_exit); module_param(disable_cfc, bool, 0644); MODULE_PARM_DESC(disable_cfc, "Disable credit based flow control"); module_param(channel_mtu, int, 0644); MODULE_PARM_DESC(channel_mtu, "Default MTU for the RFCOMM channel"); module_param(l2cap_ertm, bool, 0644); MODULE_PARM_DESC(l2cap_ertm, "Use L2CAP ERTM mode for connection"); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); MODULE_DESCRIPTION("Bluetooth RFCOMM ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS("bt-proto-3");
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 // SPDX-License-Identifier: GPL-2.0 /* Bareudp: UDP tunnel encasulation for different Payload types like * MPLS, NSH, IP, etc. * Copyright (c) 2019 Nokia, Inc. * Authors: Martin Varghese, <martin.varghese@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/hash.h> #include <net/dst_metadata.h> #include <net/gro_cells.h> #include <net/rtnetlink.h> #include <net/protocol.h> #include <net/ip6_tunnel.h> #include <net/ip_tunnels.h> #include <net/udp_tunnel.h> #include <net/bareudp.h> #define BAREUDP_BASE_HLEN sizeof(struct udphdr) #define BAREUDP_IPV4_HLEN (sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define BAREUDP_IPV6_HLEN (sizeof(struct ipv6hdr) + \ sizeof(struct udphdr)) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); /* per-network namespace private data for this module */ static unsigned int bareudp_net_id; struct bareudp_net { struct list_head bareudp_list; }; struct bareudp_conf { __be16 ethertype; __be16 port; u16 sport_min; bool multi_proto_mode; }; /* Pseudo network device */ struct bareudp_dev { struct net *net; /* netns for packet i/o */ struct net_device *dev; /* netdev for bareudp tunnel */ __be16 ethertype; __be16 port; u16 sport_min; bool multi_proto_mode; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; }; static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct metadata_dst *tun_dst = NULL; IP_TUNNEL_DECLARE_FLAGS(key) = { }; struct bareudp_dev *bareudp; unsigned short family; unsigned int len; __be16 proto; void *oiph; int err; int nh; bareudp = rcu_dereference_sk_user_data(sk); if (!bareudp) goto drop; if (skb->protocol == htons(ETH_P_IP)) family = AF_INET; else family = AF_INET6; if (bareudp->ethertype == htons(ETH_P_IP)) { __u8 ipversion; if (skb_copy_bits(skb, BAREUDP_BASE_HLEN, &ipversion, sizeof(ipversion))) { dev_dstats_rx_dropped(bareudp->dev); goto drop; } ipversion >>= 4; if (ipversion == 4) { proto = htons(ETH_P_IP); } else if (ipversion == 6 && bareudp->multi_proto_mode) { proto = htons(ETH_P_IPV6); } else { dev_dstats_rx_dropped(bareudp->dev); goto drop; } } else if (bareudp->ethertype == htons(ETH_P_MPLS_UC)) { struct iphdr *tunnel_hdr; tunnel_hdr = (struct iphdr *)skb_network_header(skb); if (tunnel_hdr->version == 4) { if (!ipv4_is_multicast(tunnel_hdr->daddr)) { proto = bareudp->ethertype; } else if (bareudp->multi_proto_mode && ipv4_is_multicast(tunnel_hdr->daddr)) { proto = htons(ETH_P_MPLS_MC); } else { dev_dstats_rx_dropped(bareudp->dev); goto drop; } } else { int addr_type; struct ipv6hdr *tunnel_hdr_v6; tunnel_hdr_v6 = (struct ipv6hdr *)skb_network_header(skb); addr_type = ipv6_addr_type((struct in6_addr *)&tunnel_hdr_v6->daddr); if (!(addr_type & IPV6_ADDR_MULTICAST)) { proto = bareudp->ethertype; } else if (bareudp->multi_proto_mode && (addr_type & IPV6_ADDR_MULTICAST)) { proto = htons(ETH_P_MPLS_MC); } else { dev_dstats_rx_dropped(bareudp->dev); goto drop; } } } else { proto = bareudp->ethertype; } if (iptunnel_pull_header(skb, BAREUDP_BASE_HLEN, proto, !net_eq(bareudp->net, dev_net(bareudp->dev)))) { dev_dstats_rx_dropped(bareudp->dev); goto drop; } __set_bit(IP_TUNNEL_KEY_BIT, key); tun_dst = udp_tun_rx_dst(skb, family, key, 0, 0); if (!tun_dst) { dev_dstats_rx_dropped(bareudp->dev); goto drop; } skb_dst_set(skb, &tun_dst->dst); skb->dev = bareudp->dev; skb_reset_mac_header(skb); /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(bareudp->dev, rx_length_errors); DEV_STATS_INC(bareudp->dev, rx_errors); goto drop; } /* Get the outer header. */ oiph = skb->head + nh; if (!ipv6_mod_enabled() || family == AF_INET) err = IP_ECN_decapsulate(oiph, skb); else err = IP6_ECN_decapsulate(oiph, skb); if (unlikely(err)) { if (log_ecn_error) { if (!ipv6_mod_enabled() || family == AF_INET) net_info_ratelimited("non-ECT from %pI4 " "with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); } if (err > 1) { DEV_STATS_INC(bareudp->dev, rx_frame_errors); DEV_STATS_INC(bareudp->dev, rx_errors); goto drop; } } len = skb->len; err = gro_cells_receive(&bareudp->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) dev_dstats_rx_add(bareudp->dev, len); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; } static int bareudp_err_lookup(struct sock *sk, struct sk_buff *skb) { return 0; } static int bareudp_init(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); int err; err = gro_cells_init(&bareudp->gro_cells, dev); if (err) return err; return 0; } static void bareudp_uninit(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); gro_cells_destroy(&bareudp->gro_cells); } static struct socket *bareudp_create_sock(struct net *net, __be16 port) { struct udp_port_cfg udp_conf; struct socket *sock; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6_mod_enabled()) udp_conf.family = AF_INET6; else udp_conf.family = AF_INET; udp_conf.local_udp_port = port; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } /* Create new listen socket if needed */ static int bareudp_socket_create(struct bareudp_dev *bareudp, __be16 port) { struct udp_tunnel_sock_cfg tunnel_cfg; struct socket *sock; sock = bareudp_create_sock(bareudp->net, port); if (IS_ERR(sock)) return PTR_ERR(sock); /* Mark socket as an encapsulation socket */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = bareudp; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = bareudp_udp_encap_recv; tunnel_cfg.encap_err_lookup = bareudp_err_lookup; tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(bareudp->net, sock, &tunnel_cfg); rcu_assign_pointer(bareudp->sock, sock); return 0; } static int bareudp_open(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); int ret = 0; ret = bareudp_socket_create(bareudp, bareudp->port); return ret; } static void bareudp_sock_release(struct bareudp_dev *bareudp) { struct socket *sock; sock = bareudp->sock; rcu_assign_pointer(bareudp->sock, NULL); synchronize_net(); udp_tunnel_sock_release(sock); } static int bareudp_stop(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); bareudp_sock_release(bareudp); return 0; } static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct bareudp_dev *bareudp, const struct ip_tunnel_info *info) { bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev)); bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct socket *sock = rcu_dereference(bareudp->sock); const struct ip_tunnel_key *key = &info->key; struct rtable *rt; __be16 sport, df; int min_headroom; __u8 tos, ttl; __be32 saddr; int err; if (skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) return -EINVAL; if (!sock) return -ESHUTDOWN; sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, 0, &saddr, &info->key, sport, bareudp->port, key->tos, use_cache ? (struct dst_cache *)&info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); skb_tunnel_check_pmtu(skb, &rt->dst, BAREUDP_IPV4_HLEN + info->options_len, false); tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); ttl = key->ttl; df = test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) ? htons(IP_DF) : 0; skb_scrub_packet(skb, xnet); err = -ENOSPC; if (!skb_pull(skb, skb_network_offset(skb))) goto free_dst; min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct iphdr); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; skb_set_inner_protocol(skb, bareudp->ethertype); udp_tunnel_xmit_skb(rt, sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, bareudp->port, !net_eq(bareudp->net, dev_net(bareudp->dev)), !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)); return 0; free_dst: dst_release(&rt->dst); return err; } static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct bareudp_dev *bareudp, const struct ip_tunnel_info *info) { bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev)); bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct socket *sock = rcu_dereference(bareudp->sock); const struct ip_tunnel_key *key = &info->key; struct dst_entry *dst = NULL; struct in6_addr saddr, daddr; int min_headroom; __u8 prio, ttl; __be16 sport; int err; if (skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) return -EINVAL; if (!sock) return -ESHUTDOWN; sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, 0, &saddr, key, sport, bareudp->port, key->tos, use_cache ? (struct dst_cache *) &info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len, false); prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); ttl = key->ttl; skb_scrub_packet(skb, xnet); err = -ENOSPC; if (!skb_pull(skb, skb_network_offset(skb))) goto free_dst; min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct ipv6hdr); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; daddr = info->key.u.ipv6.dst; udp_tunnel6_xmit_skb(dst, sock->sk, skb, dev, &saddr, &daddr, prio, ttl, info->key.label, sport, bareudp->port, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)); return 0; free_dst: dst_release(dst); return err; } static bool bareudp_proto_valid(struct bareudp_dev *bareudp, __be16 proto) { if (bareudp->ethertype == proto) return true; if (!bareudp->multi_proto_mode) return false; if (bareudp->ethertype == htons(ETH_P_MPLS_UC) && proto == htons(ETH_P_MPLS_MC)) return true; if (bareudp->ethertype == htons(ETH_P_IP) && proto == htons(ETH_P_IPV6)) return true; return false; } static netdev_tx_t bareudp_xmit(struct sk_buff *skb, struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); struct ip_tunnel_info *info = NULL; int err; if (!bareudp_proto_valid(bareudp, skb->protocol)) { err = -EINVAL; goto tx_error; } info = skb_tunnel_info(skb); if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { err = -EINVAL; goto tx_error; } rcu_read_lock(); if (ipv6_mod_enabled() && info->mode & IP_TUNNEL_INFO_IPV6) err = bareudp6_xmit_skb(skb, dev, bareudp, info); else err = bareudp_xmit_skb(skb, dev, bareudp, info); rcu_read_unlock(); if (likely(!err)) return NETDEV_TX_OK; tx_error: dev_kfree_skb(skb); if (err == -ELOOP) DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static int bareudp_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); struct bareudp_dev *bareudp = netdev_priv(dev); bool use_cache; __be16 sport; use_cache = ip_tunnel_dst_cache_usable(skb, info); sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); if (!ipv6_mod_enabled() || ip_tunnel_info_af(info) == AF_INET) { struct rtable *rt; __be32 saddr; rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, 0, &saddr, &info->key, sport, bareudp->port, info->key.tos, use_cache ? &info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); info->key.u.ipv4.src = saddr; } else if (ip_tunnel_info_af(info) == AF_INET6) { struct dst_entry *dst; struct in6_addr saddr; struct socket *sock = rcu_dereference(bareudp->sock); dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, 0, &saddr, &info->key, sport, bareudp->port, info->key.tos, use_cache ? &info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); dst_release(dst); info->key.u.ipv6.src = saddr; } else { return -EINVAL; } info->key.tp_src = sport; info->key.tp_dst = bareudp->port; return 0; } static const struct net_device_ops bareudp_netdev_ops = { .ndo_init = bareudp_init, .ndo_uninit = bareudp_uninit, .ndo_open = bareudp_open, .ndo_stop = bareudp_stop, .ndo_start_xmit = bareudp_xmit, .ndo_fill_metadata_dst = bareudp_fill_metadata_dst, }; static const struct nla_policy bareudp_policy[IFLA_BAREUDP_MAX + 1] = { [IFLA_BAREUDP_PORT] = { .type = NLA_U16 }, [IFLA_BAREUDP_ETHERTYPE] = { .type = NLA_U16 }, [IFLA_BAREUDP_SRCPORT_MIN] = { .type = NLA_U16 }, [IFLA_BAREUDP_MULTIPROTO_MODE] = { .type = NLA_FLAG }, }; /* Info for udev, that this is a virtual tunnel endpoint */ static const struct device_type bareudp_type = { .name = "bareudp", }; /* Initialize the device structure. */ static void bareudp_setup(struct net_device *dev) { dev->netdev_ops = &bareudp_netdev_ops; dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &bareudp_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = ETH_DATA_LEN; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - BAREUDP_BASE_HLEN; dev->type = ARPHRD_NONE; netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int bareudp_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (!data) { NL_SET_ERR_MSG(extack, "Not enough attributes provided to perform the operation"); return -EINVAL; } return 0; } static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, struct netlink_ext_ack *extack) { memset(conf, 0, sizeof(*conf)); if (!data[IFLA_BAREUDP_PORT]) { NL_SET_ERR_MSG(extack, "port not specified"); return -EINVAL; } if (!data[IFLA_BAREUDP_ETHERTYPE]) { NL_SET_ERR_MSG(extack, "ethertype not specified"); return -EINVAL; } conf->port = nla_get_u16(data[IFLA_BAREUDP_PORT]); conf->ethertype = nla_get_u16(data[IFLA_BAREUDP_ETHERTYPE]); if (data[IFLA_BAREUDP_SRCPORT_MIN]) conf->sport_min = nla_get_u16(data[IFLA_BAREUDP_SRCPORT_MIN]); if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) conf->multi_proto_mode = true; return 0; } static struct bareudp_dev *bareudp_find_dev(struct bareudp_net *bn, const struct bareudp_conf *conf) { struct bareudp_dev *bareudp, *t = NULL; list_for_each_entry(bareudp, &bn->bareudp_list, next) { if (conf->port == bareudp->port) t = bareudp; } return t; } static int bareudp_configure(struct net *net, struct net_device *dev, struct bareudp_conf *conf, struct netlink_ext_ack *extack) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); struct bareudp_dev *t, *bareudp = netdev_priv(dev); int err; bareudp->net = net; bareudp->dev = dev; t = bareudp_find_dev(bn, conf); if (t) { NL_SET_ERR_MSG(extack, "Another bareudp device using the same port already exists"); return -EBUSY; } if (conf->multi_proto_mode && (conf->ethertype != htons(ETH_P_MPLS_UC) && conf->ethertype != htons(ETH_P_IP))) { NL_SET_ERR_MSG(extack, "Cannot set multiproto mode for this ethertype (only IPv4 and unicast MPLS are supported)"); return -EINVAL; } bareudp->port = conf->port; bareudp->ethertype = conf->ethertype; bareudp->sport_min = conf->sport_min; bareudp->multi_proto_mode = conf->multi_proto_mode; err = register_netdevice(dev); if (err) return err; list_add(&bareudp->next, &bn->bareudp_list); return 0; } static int bareudp_link_config(struct net_device *dev, struct nlattr *tb[]) { int err; if (tb[IFLA_MTU]) { err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); if (err) return err; } return 0; } static void bareudp_dellink(struct net_device *dev, struct list_head *head) { struct bareudp_dev *bareudp = netdev_priv(dev); list_del(&bareudp->next); unregister_netdevice_queue(dev, head); } static int bareudp_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct bareudp_conf conf; int err; err = bareudp2info(data, &conf, extack); if (err) return err; err = bareudp_configure(net, dev, &conf, extack); if (err) return err; err = bareudp_link_config(dev, tb); if (err) goto err_unconfig; return 0; err_unconfig: bareudp_dellink(dev, NULL); return err; } static size_t bareudp_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_PORT */ nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ 0; } static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); if (nla_put_be16(skb, IFLA_BAREUDP_PORT, bareudp->port)) goto nla_put_failure; if (nla_put_be16(skb, IFLA_BAREUDP_ETHERTYPE, bareudp->ethertype)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BAREUDP_SRCPORT_MIN, bareudp->sport_min)) goto nla_put_failure; if (bareudp->multi_proto_mode && nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops bareudp_link_ops __read_mostly = { .kind = "bareudp", .maxtype = IFLA_BAREUDP_MAX, .policy = bareudp_policy, .priv_size = sizeof(struct bareudp_dev), .setup = bareudp_setup, .validate = bareudp_validate, .newlink = bareudp_newlink, .dellink = bareudp_dellink, .get_size = bareudp_get_size, .fill_info = bareudp_fill_info, }; static __net_init int bareudp_init_net(struct net *net) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); INIT_LIST_HEAD(&bn->bareudp_list); return 0; } static void bareudp_destroy_tunnels(struct net *net, struct list_head *head) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); struct bareudp_dev *bareudp, *next; list_for_each_entry_safe(bareudp, next, &bn->bareudp_list, next) unregister_netdevice_queue(bareudp->dev, head); } static void __net_exit bareudp_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_kill_list) { struct net *net; list_for_each_entry(net, net_list, exit_list) bareudp_destroy_tunnels(net, dev_kill_list); } static struct pernet_operations bareudp_net_ops = { .init = bareudp_init_net, .exit_batch_rtnl = bareudp_exit_batch_rtnl, .id = &bareudp_net_id, .size = sizeof(struct bareudp_net), }; static int __init bareudp_init_module(void) { int rc; rc = register_pernet_subsys(&bareudp_net_ops); if (rc) goto out1; rc = rtnl_link_register(&bareudp_link_ops); if (rc) goto out2; return 0; out2: unregister_pernet_subsys(&bareudp_net_ops); out1: return rc; } late_initcall(bareudp_init_module); static void __exit bareudp_cleanup_module(void) { rtnl_link_unregister(&bareudp_link_ops); unregister_pernet_subsys(&bareudp_net_ops); } module_exit(bareudp_cleanup_module); MODULE_ALIAS_RTNL_LINK("bareudp"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Varghese <martin.varghese@nokia.com>"); MODULE_DESCRIPTION("Interface driver for UDP encapsulated traffic");
74 74 73 73 61 14 14 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_proto_udp.c: UDP load balancing support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * Network name space (netns) aware. */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/in.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/udp.h> #include <linux/indirect_call_wrapper.h> #include <net/ip_vs.h> #include <net/ip.h> #include <net/ip6_checksum.h> static int udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); static int udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_service *svc; struct udphdr _udph, *uh; __be16 _ports[2], *ports = NULL; if (likely(!ip_vs_iph_icmp(iph))) { /* IPv6 fragments, only first fragment will hit this */ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); if (uh) ports = &uh->source; } else { ports = skb_header_pointer( skb, iph->len, sizeof(_ports), &_ports); } if (!ports) { *verdict = NF_DROP; return 0; } if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); else svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->saddr, ports[0]); if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ *verdict = NF_DROP; return 0; } /* * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; return 0; } } /* NF_ACCEPT */ return 1; } static inline void udp_fast_csum_update(int af, struct udphdr *uhdr, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) uhdr->check = csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldport, newport, ~csum_unfold(uhdr->check)))); else #endif uhdr->check = csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldport, newport, ~csum_unfold(uhdr->check)))); if (!uhdr->check) uhdr->check = CSUM_MANGLED_0; } static inline void udp_partial_csum_update(int af, struct udphdr *uhdr, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldlen, __be16 newlen) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) uhdr->check = ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, csum_unfold(uhdr->check)))); else #endif uhdr->check = ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, csum_unfold(uhdr->check)))); } INDIRECT_CALLABLE_SCOPE int udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct udphdr *udph; unsigned int udphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!udp_csum_check(cp->af, skb, pp)) return 0; /* * Call application helper if needed */ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - udphoff; else payload_csum = true; } udph = (void *)skb_network_header(skb) + udphoff; udph->source = cp->vport; /* * Adjust UDP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - udphoff)); } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) udph->check = csum_ipv6_magic(&cp->vaddr.in6, &cp->caddr.in6, skb->len - udphoff, cp->protocol, skb->csum); else #endif udph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - udphoff, cp->protocol, skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, udph->check, (char*)&(udph->check) - (char*)udph); } return 1; } static int udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct udphdr *udph; unsigned int udphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!udp_csum_check(cp->af, skb, pp)) return 0; /* * Attempt ip_vs_app call. * It will fix ip_vs_conn */ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - udphoff; else payload_csum = true; } udph = (void *)skb_network_header(skb) + udphoff; udph->dest = cp->dport; /* * Adjust UDP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - udphoff)); } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) udph->check = csum_ipv6_magic(&cp->caddr.in6, &cp->daddr.in6, skb->len - udphoff, cp->protocol, skb->csum); else #endif udph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - udphoff, cp->protocol, skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { struct udphdr _udph, *uh; unsigned int udphoff; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) udphoff = sizeof(struct ipv6hdr); else #endif udphoff = ip_hdrlen(skb); uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); if (uh == NULL) return 0; if (uh->check != 0) { switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); fallthrough; case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - udphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } } else #endif if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - udphoff, ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } break; default: /* No need to checksum. */ break; } } return 1; } static inline __u16 udp_app_hashkey(__be16 port) { return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) & UDP_APP_TAB_MASK; } static int udp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); hash = udp_app_hashkey(port); list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); atomic_inc(&pd->appcnt); out: return ret; } static void udp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); } static int udp_app_conn_bind(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; /* Default binding: bind app only for NAT */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return 0; /* Lookup application incarnations and bind the right one */ hash = udp_app_hashkey(cp->vport); list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); break; } } return result; } static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = { [IP_VS_UDP_S_NORMAL] = 5*60*HZ, [IP_VS_UDP_S_LAST] = 2*HZ, }; static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = { [IP_VS_UDP_S_NORMAL] = "UDP", [IP_VS_UDP_S_LAST] = "BUG!", }; static const char * udp_state_name(int state) { if (state >= IP_VS_UDP_S_LAST) return "ERR!"; return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; } static void udp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { if (unlikely(!pd)) { pr_err("UDP no ns data\n"); return; } cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; if (direction == IP_VS_DIR_OUTPUT) ip_vs_control_assure_ct(cp); } static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); if (!pd->timeout_table) return -ENOMEM; return 0; } static void __udp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_udp = { .name = "UDP", .protocol = IPPROTO_UDP, .num_states = IP_VS_UDP_S_LAST, .dont_defrag = 0, .init = NULL, .exit = NULL, .init_netns = __udp_init, .exit_netns = __udp_exit, .conn_schedule = udp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = udp_snat_handler, .dnat_handler = udp_dnat_handler, .state_transition = udp_state_transition, .state_name = udp_state_name, .register_app = udp_register_app, .unregister_app = udp_unregister_app, .app_conn_bind = udp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, };
38 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/module.h> #include <linux/device.h> #include <linux/netdevice.h> #include <net/bonding.h> #include <net/bond_alb.h> #if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_NET_NS) #include <linux/debugfs.h> #include <linux/seq_file.h> static struct dentry *bonding_debug_root; /* Show RLB hash table */ static int bond_debug_rlb_hash_show(struct seq_file *m, void *v) { struct bonding *bond = m->private; struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; u32 hash_index; if (BOND_MODE(bond) != BOND_MODE_ALB) return 0; seq_printf(m, "SourceIP DestinationIP " "Destination MAC DEV\n"); spin_lock_bh(&bond->mode_lock); hash_index = bond_info->rx_hashtbl_used_head; for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) { client_info = &(bond_info->rx_hashtbl[hash_index]); seq_printf(m, "%-15pI4 %-15pI4 %-17pM %s\n", &client_info->ip_src, &client_info->ip_dst, &client_info->mac_dst, client_info->slave->dev->name); } spin_unlock_bh(&bond->mode_lock); return 0; } DEFINE_SHOW_ATTRIBUTE(bond_debug_rlb_hash); void bond_debug_register(struct bonding *bond) { bond->debug_dir = debugfs_create_dir(bond->dev->name, bonding_debug_root); debugfs_create_file("rlb_hash_table", 0400, bond->debug_dir, bond, &bond_debug_rlb_hash_fops); } void bond_debug_unregister(struct bonding *bond) { debugfs_remove_recursive(bond->debug_dir); } void bond_debug_reregister(struct bonding *bond) { int err = debugfs_change_name(bond->debug_dir, "%s", bond->dev->name); if (err) { netdev_warn(bond->dev, "failed to reregister, so just unregister old one\n"); bond_debug_unregister(bond); } } void __init bond_create_debugfs(void) { bonding_debug_root = debugfs_create_dir("bonding", NULL); if (IS_ERR(bonding_debug_root)) pr_warn("Warning: Cannot create bonding directory in debugfs\n"); } void bond_destroy_debugfs(void) { debugfs_remove_recursive(bonding_debug_root); bonding_debug_root = NULL; } #else /* !CONFIG_DEBUG_FS */ void bond_debug_register(struct bonding *bond) { } void bond_debug_unregister(struct bonding *bond) { } void bond_debug_reregister(struct bonding *bond) { } void __init bond_create_debugfs(void) { } void bond_destroy_debugfs(void) { } #endif /* CONFIG_DEBUG_FS */
4 6 1 1 6 7 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/netrom.h> /* * This routine purges all of the queues of frames. */ void nr_clear_queues(struct sock *sk) { struct nr_sock *nr = nr_sk(sk); skb_queue_purge(&sk->sk_write_queue); skb_queue_purge(&nr->ack_queue); skb_queue_purge(&nr->reseq_queue); skb_queue_purge(&nr->frag_queue); } /* * This routine purges the input queue of those frames that have been * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the * SDL diagram. */ void nr_frames_acked(struct sock *sk, unsigned short nr) { struct nr_sock *nrom = nr_sk(sk); struct sk_buff *skb; /* * Remove all the ack-ed frames from the ack queue. */ if (nrom->va != nr) { while (skb_peek(&nrom->ack_queue) != NULL && nrom->va != nr) { skb = skb_dequeue(&nrom->ack_queue); kfree_skb(skb); nrom->va = (nrom->va + 1) % NR_MODULUS; } } } /* * Requeue all the un-ack-ed frames on the output queue to be picked * up by nr_kick called from the timer. This arrangement handles the * possibility of an empty output queue. */ void nr_requeue_frames(struct sock *sk) { struct sk_buff *skb, *skb_prev = NULL; while ((skb = skb_dequeue(&nr_sk(sk)->ack_queue)) != NULL) { if (skb_prev == NULL) skb_queue_head(&sk->sk_write_queue, skb); else skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } /* * Validate that the value of nr is between va and vs. Return true or * false for testing. */ int nr_validate_nr(struct sock *sk, unsigned short nr) { struct nr_sock *nrom = nr_sk(sk); unsigned short vc = nrom->va; while (vc != nrom->vs) { if (nr == vc) return 1; vc = (vc + 1) % NR_MODULUS; } return nr == nrom->vs; } /* * Check that ns is within the receive window. */ int nr_in_rx_window(struct sock *sk, unsigned short ns) { struct nr_sock *nr = nr_sk(sk); unsigned short vc = nr->vr; unsigned short vt = (nr->vl + nr->window) % NR_MODULUS; while (vc != vt) { if (ns == vc) return 1; vc = (vc + 1) % NR_MODULUS; } return 0; } /* * This routine is called when the HDLC layer internally generates a * control frame. */ void nr_write_internal(struct sock *sk, int frametype) { struct nr_sock *nr = nr_sk(sk); struct sk_buff *skb; unsigned char *dptr; int len, timeout; len = NR_TRANSPORT_LEN; switch (frametype & 0x0F) { case NR_CONNREQ: len += 17; break; case NR_CONNACK: len += (nr->bpqext) ? 2 : 1; break; case NR_DISCREQ: case NR_DISCACK: case NR_INFOACK: break; default: printk(KERN_ERR "NET/ROM: nr_write_internal - invalid frame type %d\n", frametype); return; } skb = alloc_skb(NR_NETWORK_LEN + len, GFP_ATOMIC); if (!skb) return; /* * Space for AX.25 and NET/ROM network header */ skb_reserve(skb, NR_NETWORK_LEN); dptr = skb_put(skb, len); switch (frametype & 0x0F) { case NR_CONNREQ: timeout = nr->t1 / HZ; *dptr++ = nr->my_index; *dptr++ = nr->my_id; *dptr++ = 0; *dptr++ = 0; *dptr++ = frametype; *dptr++ = nr->window; memcpy(dptr, &nr->user_addr, AX25_ADDR_LEN); dptr[6] &= ~AX25_CBIT; dptr[6] &= ~AX25_EBIT; dptr[6] |= AX25_SSSID_SPARE; dptr += AX25_ADDR_LEN; memcpy(dptr, &nr->source_addr, AX25_ADDR_LEN); dptr[6] &= ~AX25_CBIT; dptr[6] &= ~AX25_EBIT; dptr[6] |= AX25_SSSID_SPARE; dptr += AX25_ADDR_LEN; *dptr++ = timeout % 256; *dptr++ = timeout / 256; break; case NR_CONNACK: *dptr++ = nr->your_index; *dptr++ = nr->your_id; *dptr++ = nr->my_index; *dptr++ = nr->my_id; *dptr++ = frametype; *dptr++ = nr->window; if (nr->bpqext) *dptr++ = READ_ONCE(sysctl_netrom_network_ttl_initialiser); break; case NR_DISCREQ: case NR_DISCACK: *dptr++ = nr->your_index; *dptr++ = nr->your_id; *dptr++ = 0; *dptr++ = 0; *dptr++ = frametype; break; case NR_INFOACK: *dptr++ = nr->your_index; *dptr++ = nr->your_id; *dptr++ = 0; *dptr++ = nr->vr; *dptr++ = frametype; break; } nr_transmit_buffer(sk, skb); } /* * This routine is called to send an error reply. */ void __nr_transmit_reply(struct sk_buff *skb, int mine, unsigned char cmdflags) { struct sk_buff *skbn; unsigned char *dptr; int len; len = NR_NETWORK_LEN + NR_TRANSPORT_LEN + 1; if ((skbn = alloc_skb(len, GFP_ATOMIC)) == NULL) return; skb_reserve(skbn, 0); dptr = skb_put(skbn, NR_NETWORK_LEN + NR_TRANSPORT_LEN); skb_copy_from_linear_data_offset(skb, 7, dptr, AX25_ADDR_LEN); dptr[6] &= ~AX25_CBIT; dptr[6] &= ~AX25_EBIT; dptr[6] |= AX25_SSSID_SPARE; dptr += AX25_ADDR_LEN; skb_copy_from_linear_data(skb, dptr, AX25_ADDR_LEN); dptr[6] &= ~AX25_CBIT; dptr[6] |= AX25_EBIT; dptr[6] |= AX25_SSSID_SPARE; dptr += AX25_ADDR_LEN; *dptr++ = READ_ONCE(sysctl_netrom_network_ttl_initialiser); if (mine) { *dptr++ = 0; *dptr++ = 0; *dptr++ = skb->data[15]; *dptr++ = skb->data[16]; } else { *dptr++ = skb->data[15]; *dptr++ = skb->data[16]; *dptr++ = 0; *dptr++ = 0; } *dptr++ = cmdflags; *dptr++ = 0; if (!nr_route_frame(skbn, NULL)) kfree_skb(skbn); } void nr_disconnect(struct sock *sk, int reason) { nr_stop_t1timer(sk); nr_stop_t2timer(sk); nr_stop_t4timer(sk); nr_stop_idletimer(sk); nr_clear_queues(sk); nr_sk(sk)->state = NR_STATE_0; sk->sk_state = TCP_CLOSE; sk->sk_err = reason; sk->sk_shutdown |= SEND_SHUTDOWN; if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } }
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2021 Intel Corporation */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include "hci_codec.h" static int hci_codec_list_add(struct list_head *list, struct hci_op_read_local_codec_caps *sent, struct hci_rp_read_local_codec_caps *rp, void *caps, __u32 len) { struct codec_list *entry; entry = kzalloc(sizeof(*entry) + len, GFP_KERNEL); if (!entry) return -ENOMEM; entry->id = sent->id; if (sent->id == 0xFF) { entry->cid = __le16_to_cpu(sent->cid); entry->vid = __le16_to_cpu(sent->vid); } entry->transport = sent->transport; entry->len = len; entry->num_caps = 0; if (rp) { entry->num_caps = rp->num_caps; memcpy(entry->caps, caps, len); } list_add(&entry->list, list); return 0; } void hci_codec_list_clear(struct list_head *codec_list) { struct codec_list *c, *n; list_for_each_entry_safe(c, n, codec_list, list) { list_del(&c->list); kfree(c); } } static void hci_read_codec_capabilities(struct hci_dev *hdev, __u8 transport, struct hci_op_read_local_codec_caps *cmd) { __u8 i; for (i = 0; i < TRANSPORT_TYPE_MAX; i++) { if (transport & BIT(i)) { struct hci_rp_read_local_codec_caps *rp; struct hci_codec_caps *caps; struct sk_buff *skb; __u8 j; __u32 len; cmd->transport = i; /* If Read_Codec_Capabilities command is not supported * then just add codec to the list without caps */ if (!(hdev->commands[45] & 0x08)) { hci_dev_lock(hdev); hci_codec_list_add(&hdev->local_codecs, cmd, NULL, NULL, 0); hci_dev_unlock(hdev); continue; } skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS, sizeof(*cmd), cmd, 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read codec capabilities (%ld)", PTR_ERR(skb)); continue; } if (skb->len < sizeof(*rp)) goto error; rp = (void *)skb->data; if (rp->status) goto error; if (!rp->num_caps) { len = 0; /* this codec doesn't have capabilities */ goto skip_caps_parse; } skb_pull(skb, sizeof(*rp)); for (j = 0, len = 0; j < rp->num_caps; j++) { caps = (void *)skb->data; if (skb->len < sizeof(*caps)) goto error; if (skb->len < caps->len) goto error; len += sizeof(caps->len) + caps->len; skb_pull(skb, sizeof(caps->len) + caps->len); } skip_caps_parse: hci_dev_lock(hdev); hci_codec_list_add(&hdev->local_codecs, cmd, rp, (__u8 *)rp + sizeof(*rp), len); hci_dev_unlock(hdev); error: kfree_skb(skb); } } } void hci_read_supported_codecs(struct hci_dev *hdev) { struct sk_buff *skb; struct hci_rp_read_local_supported_codecs *rp; struct hci_std_codecs *std_codecs; struct hci_vnd_codecs *vnd_codecs; struct hci_op_read_local_codec_caps caps; __u8 i; skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL, 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", PTR_ERR(skb)); return; } if (skb->len < sizeof(*rp)) goto error; rp = (void *)skb->data; if (rp->status) goto error; skb_pull(skb, sizeof(rp->status)); std_codecs = (void *)skb->data; /* validate codecs length before accessing */ if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)) goto error; /* enumerate codec capabilities of standard codecs */ memset(&caps, 0, sizeof(caps)); for (i = 0; i < std_codecs->num; i++) { caps.id = std_codecs->codec[i]; caps.direction = 0x00; hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps); } skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)); vnd_codecs = (void *)skb->data; /* validate vendor codecs length before accessing */ if (skb->len < flex_array_size(vnd_codecs, codec, vnd_codecs->num) + sizeof(vnd_codecs->num)) goto error; /* enumerate vendor codec capabilities */ for (i = 0; i < vnd_codecs->num; i++) { caps.id = 0xFF; caps.cid = vnd_codecs->codec[i].cid; caps.vid = vnd_codecs->codec[i].vid; caps.direction = 0x00; hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps); } error: kfree_skb(skb); } void hci_read_supported_codecs_v2(struct hci_dev *hdev) { struct sk_buff *skb; struct hci_rp_read_local_supported_codecs_v2 *rp; struct hci_std_codecs_v2 *std_codecs; struct hci_vnd_codecs_v2 *vnd_codecs; struct hci_op_read_local_codec_caps caps; __u8 i; skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL, 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", PTR_ERR(skb)); return; } if (skb->len < sizeof(*rp)) goto error; rp = (void *)skb->data; if (rp->status) goto error; skb_pull(skb, sizeof(rp->status)); std_codecs = (void *)skb->data; /* check for payload data length before accessing */ if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)) goto error; memset(&caps, 0, sizeof(caps)); for (i = 0; i < std_codecs->num; i++) { caps.id = std_codecs->codec[i].id; hci_read_codec_capabilities(hdev, std_codecs->codec[i].transport, &caps); } skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) + sizeof(std_codecs->num)); vnd_codecs = (void *)skb->data; /* check for payload data length before accessing */ if (skb->len < flex_array_size(vnd_codecs, codec, vnd_codecs->num) + sizeof(vnd_codecs->num)) goto error; for (i = 0; i < vnd_codecs->num; i++) { caps.id = 0xFF; caps.cid = vnd_codecs->codec[i].cid; caps.vid = vnd_codecs->codec[i].vid; hci_read_codec_capabilities(hdev, vnd_codecs->codec[i].transport, &caps); } error: kfree_skb(skb); }
131 131 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/madvise.c * * Copyright (C) 1999 Linus Torvalds * Copyright (C) 2002 Christoph Hellwig */ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/mempolicy.h> #include <linux/page-isolation.h> #include <linux/page_idle.h> #include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> #include <linux/fadvise.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mm_inline.h> #include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/pagewalk.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> #include <asm/tlb.h> #include "internal.h" #include "swap.h" /* * Maximum number of attempts we make to install guard pages before we give up * and return -ERESTARTNOINTR to have userspace try again. */ #define MAX_MADVISE_GUARD_RETRIES 3 struct madvise_walk_private { struct mmu_gather *tlb; bool pageout; }; /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need * to only take it for reading. */ static int madvise_need_mmap_write(int behavior) { switch (behavior) { case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_COLD: case MADV_PAGEOUT: case MADV_FREE: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ return 1; } } #ifdef CONFIG_ANON_VMA_NAME struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; size_t count; /* Add 1 for NUL terminator at the end of the anon_name->name */ count = strlen(name) + 1; anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); if (anon_name) { kref_init(&anon_name->kref); memcpy(anon_name->name, name, count); } return anon_name; } void anon_vma_name_free(struct kref *kref) { struct anon_vma_name *anon_name = container_of(kref, struct anon_vma_name, kref); kfree(anon_name); } struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); return vma->anon_name; } /* mmap_lock should be write-locked */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { struct anon_vma_name *orig_name = anon_vma_name(vma); if (!anon_name) { vma->anon_name = NULL; anon_vma_name_put(orig_name); return 0; } if (anon_vma_name_eq(orig_name, anon_name)) return 0; vma->anon_name = anon_vma_name_reuse(anon_name); anon_vma_name_put(orig_name); return 0; } #else /* CONFIG_ANON_VMA_NAME */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { if (anon_name) return -EINVAL; return 0; } #endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_lock held for writing; * Caller should ensure anon_name stability by raising its refcount even when * anon_name belongs to a valid vma because this function might free that vma. */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *anon_name) { struct mm_struct *mm = vma->vm_mm; int error; VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; return 0; } vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, anon_name); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) return error; } return 0; } #ifdef CONFIG_SWAP static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->private; struct swap_iocb *splug = NULL; pte_t *ptep = NULL; spinlock_t *ptl; unsigned long addr; for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; swp_entry_t entry; struct folio *folio; if (!ptep++) { ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!ptep) break; } pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) continue; pte_unmap_unlock(ptep, ptl); ptep = NULL; folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, addr, &splug); if (folio) folio_put(folio); } if (ptep) pte_unmap_unlock(ptep, ptl); swap_read_unplug(splug); cond_resched(); return 0; } static const struct mm_walk_ops swapin_walk_ops = { .pmd_entry = swapin_walk_pmd_entry, .walk_lock = PGWALK_RDLOCK, }; static void shmem_swapin_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); pgoff_t end_index = linear_page_index(vma, end) - 1; struct folio *folio; struct swap_iocb *splug = NULL; rcu_read_lock(); xas_for_each(&xas, folio, end_index) { unsigned long addr; swp_entry_t entry; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(entry)) continue; addr = vma->vm_start + ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); xas_pause(&xas); rcu_read_unlock(); folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), vma, addr, &splug); if (folio) folio_put(folio); rcu_read_lock(); } rcu_read_unlock(); swap_read_unplug(splug); } #endif /* CONFIG_SWAP */ /* * Schedule all required I/O operations. Do not wait for completion. */ static long madvise_willneed(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; loff_t offset; *prev = vma; #ifdef CONFIG_SWAP if (!file) { walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } if (shmem_mapping(file->f_mapping)) { shmem_swapin_range(vma, start, end, file->f_mapping); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } #else if (!file) return -EBADF; #endif if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } /* * Filesystem's fadvise may need to take various locks. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ *prev = NULL; /* tell sys_madvise we drop mmap_lock */ get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); mmap_read_unlock(mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); mmap_read_lock(mm); return 0; } static inline bool can_do_file_pageout(struct vm_area_struct *vma) { if (!vma->vm_file) return false; /* * paging out pagecache only for non-anonymous mappings that correspond * to the files the calling process could (if tried) open for writing; * otherwise we'd be including shared non-exclusive mappings, which * opens a side channel. */ return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, struct folio *folio, pte_t *ptep, pte_t pte, bool *any_young, bool *any_dirty) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; int max_nr = (end - addr) / PAGE_SIZE; return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, any_young, any_dirty); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct madvise_walk_private *private = walk->private; struct mmu_gather *tlb = private->tlb; bool pageout = private->pageout; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; pte_t *start_pte, *pte, ptent; spinlock_t *ptl; struct folio *folio = NULL; LIST_HEAD(folio_list); bool pageout_anon_only_filter; unsigned int batch_count = 0; int nr; if (fatal_signal_pending(current)) return -EINTR; pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && !can_do_file_pageout(vma); #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { pmd_t orig_pmd; unsigned long next = pmd_addr_end(addr, end); tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; orig_pmd = *pmd; if (is_huge_zero_pmd(orig_pmd)) goto huge_unlock; if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(orig_pmd)); goto huge_unlock; } folio = pmd_folio(orig_pmd); /* Do not interfere with other mappings of this folio */ if (folio_likely_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) goto huge_unlock; if (next - addr != HPAGE_PMD_SIZE) { int err; folio_get(folio); spin_unlock(ptl); folio_lock(folio); err = split_folio(folio); folio_unlock(folio); folio_put(folio); if (!err) goto regular_folio; return 0; } if (!pageout && pmd_young(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); huge_unlock: spin_unlock(ptl); if (pageout) reclaim_pages(&folio_list); return 0; } regular_folio: #endif tlb_change_page_size(tlb, PAGE_SIZE); restart: start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; } } if (pte_none(ptent)) continue; if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be swapped out whole. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { bool any_young; nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent, &any_young, NULL); if (any_young) ptent = pte_mkyoung(ptent); if (nr < folio_nr_pages(folio)) { int err; if (folio_likely_mapped_shared(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } } /* * Do not interfere with other mappings of this folio and * non-LRU folio. If we have a large folio at this point, we * know it is fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (!folio_test_lru(folio) || folio_mapcount(folio) != folio_nr_pages(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!pageout && pte_young(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, CYDP_CLEAR_YOUNG); tlb_remove_tlb_entries(tlb, pte, nr, addr); } /* * We are deactivating a folio for accelerating reclaiming. * VM couldn't reclaim the folio unless we clear PG_young. * As a side effect, it makes confuse idle-page tracking * because they will miss recent referenced history. */ folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); } if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } if (pageout) reclaim_pages(&folio_list); cond_resched(); return 0; } static const struct mm_walk_ops cold_walk_ops = { .pmd_entry = madvise_cold_or_pageout_pte_range, .walk_lock = PGWALK_RDLOCK, }; static void madvise_cold_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = false, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); } static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_cold_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; } static void madvise_pageout_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = true, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static long madvise_pageout(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; /* * If the VMA belongs to a private file mapping, there can be private * dirty pages which can be paged out if even this process is neither * owner nor write capable of the file. We allow private file mappings * further to pageout dirty anon pages. */ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && (vma->vm_flags & VM_MAYSHARE))) return 0; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; } static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; struct mmu_gather *tlb = walk->private; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte, ptent; struct folio *folio; int nr_swap = 0; unsigned long next; int nr, max_nr; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) return 0; tlb_change_page_size(tlb, PAGE_SIZE); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; /* * If the pte has swp_entry, just clear page table to * prevent swap-in which is more expensive rather than * (page allocation + zeroing). */ if (!pte_present(ptent)) { swp_entry_t entry; entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; free_swap_and_cache_nr(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be marked as lazyfree. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { bool any_young, any_dirty; nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent, &any_young, &any_dirty); if (nr < folio_nr_pages(folio)) { int err; if (folio_likely_mapped_shared(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); start_pte = pte; if (!start_pte) break; arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } if (any_young) ptent = pte_mkyoung(ptent); if (any_dirty) ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { if (!folio_trylock(folio)) continue; /* * If we have a large folio at this point, we know it is * fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (folio_mapcount(folio) != folio_nr_pages(folio)) { folio_unlock(folio); continue; } if (folio_test_swapcache(folio) && !folio_free_swap(folio)) { folio_unlock(folio); continue; } folio_clear_dirty(folio); folio_unlock(folio); } if (pte_young(ptent) || pte_dirty(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); tlb_remove_tlb_entries(tlb, pte, nr, addr); } folio_mark_lazyfree(folio); } if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); return 0; } static const struct mm_walk_ops madvise_free_walk_ops = { .pmd_entry = madvise_free_pte_range, .walk_lock = PGWALK_RDLOCK, }; static int madvise_free_single_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; struct mmu_gather tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) return -EINVAL; range.start = max(vma->vm_start, start_addr); if (range.start >= vma->vm_end) return -EINVAL; range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); tlb_start_vma(&tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, &madvise_free_walk_ops, &tlb); tlb_end_vma(&tlb, vma); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); return 0; } /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The * zap_page_range_single call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for * applications like large transactional databases which want to discard * pages in anonymous maps after committing to backing store the data * that was kept in them. There is no reason to write this data out to * the swap area if the application is discarding it. * * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct zap_details details = { .reclaim_pt = true, .even_cows = true, }; zap_page_range_single(vma, start, end - start, &details); return 0; } static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, unsigned long start, unsigned long *end, int behavior) { if (!is_vm_hugetlb_page(vma)) { unsigned int forbidden = VM_PFNMAP; if (behavior != MADV_DONTNEED_LOCKED) forbidden |= VM_LOCKED; return !(vma->vm_flags & forbidden); } if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) return false; if (start & ~huge_page_mask(hstate_vma(vma))) return false; /* * Madvise callers expect the length to be rounded up to PAGE_SIZE * boundaries, and may be unaware that this VMA uses huge pages. * Avoid unexpected data loss by rounding down the number of * huge pages freed. */ *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); return true; } static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { struct mm_struct *mm = vma->vm_mm; *prev = vma; if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (start == end) return 0; if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ mmap_read_lock(mm); vma = vma_lookup(mm, start); if (!vma) return -ENOMEM; /* * Potential end adjustment for hugetlb vma is OK as * the check below keeps end within vma. */ if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old * vma was split while the mmap_lock was * released the effect of the concurrent * operation may not cause madvise() to * have an undefined result. There may be an * adjacent next vma that we'll walk * next. userfaultfd_remove() will generate an * UFFD_EVENT_REMOVE repetition on the * end-vma->vm_end range, but the manager can * handle a repetition fine. */ end = vma->vm_end; } VM_WARN_ON(start >= end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) return madvise_dontneed_single_vma(vma, start, end); else if (behavior == MADV_FREE) return madvise_free_single_vma(vma, start, end); else return -EINVAL; } static long madvise_populate(struct mm_struct *mm, unsigned long start, unsigned long end, int behavior) { const bool write = behavior == MADV_POPULATE_WRITE; int locked = 1; long pages; while (start < end) { /* Populate (prefault) page tables readable/writable. */ pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; } if (pages < 0) { switch (pages) { case -EINTR: return -EINTR; case -EINVAL: /* Incompatible mappings / permissions. */ return -EINVAL; case -EHWPOISON: return -EHWPOISON; case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ return -EFAULT; default: pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } start += pages * PAGE_SIZE; } return 0; } /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. */ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { loff_t offset; int error; struct file *f; struct mm_struct *mm = vma->vm_mm; *prev = NULL; /* tell sys_madvise we drop mmap_lock */ if (vma->vm_flags & VM_LOCKED) return -EINVAL; f = vma->vm_file; if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* * Filesystem's fallocate may need to take i_rwsem. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */ mmap_read_unlock(mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); mmap_read_lock(mm); return error; } static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) { vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; /* * A user could lock after setting a guard range but that's fine, as * they'd not be able to fault in. The issue arises when we try to zap * existing locked VMAs. We don't want to do that. */ if (!allow_locked) disallowed |= VM_LOCKED; if (!vma_is_anonymous(vma)) return false; if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE) return false; return true; } static bool is_guard_pte_marker(pte_t ptent) { return is_pte_marker(ptent) && is_guard_swp_entry(pte_to_swp_entry(ptent)); } static int guard_install_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge return >0 so we abort the operation + zap. */ return pud_trans_huge(pudval) || pud_devmap(pudval); } static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge return >0 so we abort the operation + zap. */ return pmd_trans_huge(pmdval) || pmd_devmap(pmdval); } static int guard_install_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t pteval = ptep_get(pte); unsigned long *nr_pages = (unsigned long *)walk->private; /* If there is already a guard page marker, we have nothing to do. */ if (is_guard_pte_marker(pteval)) { (*nr_pages)++; return 0; } /* If populated return >0 so we abort the operation + zap. */ return 1; } static int guard_install_set_pte(unsigned long addr, unsigned long next, pte_t *ptep, struct mm_walk *walk) { unsigned long *nr_pages = (unsigned long *)walk->private; /* Simply install a PTE marker, this causes segfault on access. */ *ptep = make_pte_marker(PTE_MARKER_GUARD); (*nr_pages)++; return 0; } static const struct mm_walk_ops guard_install_walk_ops = { .pud_entry = guard_install_pud_entry, .pmd_entry = guard_install_pmd_entry, .pte_entry = guard_install_pte_entry, .install_pte = guard_install_set_pte, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_install(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { long err; int i; *prev = vma; if (!is_valid_guard_vma(vma, /* allow_locked = */false)) return -EINVAL; /* * If we install guard markers, then the range is no longer * empty from a page table perspective and therefore it's * appropriate to have an anon_vma. * * This ensures that on fork, we copy page tables correctly. */ err = anon_vma_prepare(vma); if (err) return err; /* * Optimistically try to install the guard marker pages first. If any * non-guard pages are encountered, give up and zap the range before * trying again. * * We try a few times before giving up and releasing back to userland to * loop around, releasing locks in the process to avoid contention. This * would only happen if there was a great many racing page faults. * * In most cases we should simply install the guard markers immediately * with no zap or looping. */ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ err = walk_page_range_mm(vma->vm_mm, start, end, &guard_install_walk_ops, &nr_pages); if (err < 0) return err; if (err == 0) { unsigned long nr_expected_pages = PHYS_PFN(end - start); VM_WARN_ON(nr_pages != nr_expected_pages); return 0; } /* * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ zap_page_range_single(vma, start, end - start, NULL); } /* * We were unable to install the guard pages due to being raced by page * faults. This should not happen ordinarily. We return to userspace and * immediately retry, relieving lock contention. */ return restart_syscall(); } static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pud_trans_huge(pudval) || pud_devmap(pudval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t ptent = ptep_get(pte); if (is_guard_pte_marker(ptent)) { /* Simply clear the PTE marker. */ pte_clear_not_present_full(walk->mm, addr, pte, false); update_mmu_cache(walk->vma, addr, pte); } return 0; } static const struct mm_walk_ops guard_remove_walk_ops = { .pud_entry = guard_remove_pud_entry, .pmd_entry = guard_remove_pmd_entry, .pte_entry = guard_remove_pte_entry, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { *prev = vma; /* * We're ok with removing guards in mlock()'d ranges, as this is a * non-destructive action. */ if (!is_valid_guard_vma(vma, /* allow_locked = */true)) return -EINVAL; return walk_page_range(vma->vm_mm, start, end, &guard_remove_walk_ops, NULL); } /* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own * behavior. */ static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long behavior) { int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; if (unlikely(!can_modify_vma_madv(vma, behavior))) return -EPERM; switch (behavior) { case MADV_REMOVE: return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_COLD: return madvise_cold(vma, prev, start, end); case MADV_PAGEOUT: return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(vma, prev, start, end, behavior); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; case MADV_SEQUENTIAL: new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; break; case MADV_RANDOM: new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; break; case MADV_DONTFORK: new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: if (vma->vm_flags & VM_IO) return -EINVAL; new_flags &= ~VM_DONTCOPY; break; case MADV_WIPEONFORK: /* MADV_WIPEONFORK is only supported on anonymous memory. */ if (vma->vm_file || vma->vm_flags & VM_SHARED) return -EINVAL; new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: if (vma->vm_flags & VM_DROPPABLE) return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || (vma->vm_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); if (error) goto out; break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); case MADV_GUARD_INSTALL: return madvise_guard_install(vma, prev, start, end); case MADV_GUARD_REMOVE: return madvise_guard_remove(vma, prev, start, end); } anon_name = anon_vma_name(vma); anon_vma_name_get(anon_name); error = madvise_update_vma(vma, prev, start, end, new_flags, anon_name); anon_vma_name_put(anon_name); out: /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. */ static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { unsigned long size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; for (; start < end; start += size) { unsigned long pfn; struct page *page; int ret; ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; pfn = page_to_pfn(page); /* * When soft offlining hugepages, after migrating the page * we dissolve it, therefore in the second loop "page" will * no longer be a compound page. */ size = page_size(compound_head(page)); if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; } if (ret) return ret; } return 0; } #endif static bool madvise_behavior_valid(int behavior) { switch (behavior) { case MADV_DOFORK: case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: #endif return true; default: return false; } } /* Can we invoke process_madvise() on a remote mm for the specified behavior? */ static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: case MADV_COLLAPSE: return true; default: return false; } } /* * Walk the vmas in range [start,end), and call the visit function on each one. * The visit function will get start and end parameters that cover the overlap * between the current vma and the original range. Any unmapped regions in the * original range will result in this function returning -ENOMEM while still * calling the visit function on all of the existing vmas in the range. * Must be called with the mmap_lock held for reading or writing. */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; unsigned long tmp; int unmapped_error = 0; /* * If the interval [start,end) covers some unmapped address * ranges, just ignore them, but return -ENOMEM at the end. * - different from the way of handling in mlock etc. */ vma = find_vma_prev(mm, start, &prev); if (vma && start > vma->vm_start) prev = vma; for (;;) { int error; /* Still start < end. */ if (!vma) return -ENOMEM; /* Here start < (end|vma->vm_end). */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; start = vma->vm_start; if (start >= end) break; } /* Here vma->vm_start <= start < (end|vma->vm_end) */ tmp = vma->vm_end; if (end < tmp) tmp = end; /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ error = visit(vma, &prev, start, tmp, arg); if (error) return error; start = tmp; if (prev && start < prev->vm_end) start = prev->vm_end; if (start >= end) break; if (prev) vma = find_vma(mm, prev->vm_end); else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } return unmapped_error; } #ifdef CONFIG_ANON_VMA_NAME static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long anon_name) { int error; /* Only anonymous mappings can be named */ if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, (struct anon_vma_name *)anon_name); /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { unsigned long end; unsigned long len; if (start & ~PAGE_MASK) return -EINVAL; len = (len_in + ~PAGE_MASK) & PAGE_MASK; /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ /* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should * handle paging I/O in this VM area. The idea is to help the kernel * use appropriate read-ahead and caching techniques. The information * provided is advisory only, and can be safely disregarded by the * kernel without affecting the correct operation of the application. * * behavior values: * MADV_NORMAL - the default behavior is to read clusters. This * results in some read-ahead and read-behind. * MADV_RANDOM - the system should read the minimum amount of data * on any access, since it is unlikely that the appli- * cation will need more than what it asks for. * MADV_SEQUENTIAL - pages in the given range will probably be accessed * once, so they can be aggressively read ahead, and * can be freed soon after they are accessed. * MADV_WILLNEED - the application is notifying the system to read * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. * MADV_FREE - the application marks pages in the given range as lazy free, * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. * MADV_WIPEONFORK - present the child process with zero-filled memory in this * range after a fork. * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK * MADV_HWPOISON - trigger memory error handler as if the given memory range * were corrupted by unrecoverable hardware memory failure. * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * MADV_HUGEPAGE - the application wants to back the given range by transparent * huge pages in the future. Existing pages might be coalesced and * new pages might be allocated as THP. * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * MADV_COLD - the application is not expected to use this memory soon, * deactivate pages in this range so that they can be reclaimed * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by * triggering write faults if required * * return values: * zero - success * -EINVAL - start + len < 0, start is not page-aligned, * "behavior" is not a valid value, or application * is attempting to release locked or shared pages, * or the specified address range includes file, Huge TLB, * MAP_SHARED or VMPFNMAP range. * -ENOMEM - addresses in the specified range are not currently * mapped, or are outside the AS of the process. * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. * -EPERM - memory is sealed. */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { unsigned long end; int error; int write; size_t len; struct blk_plug plug; if (!madvise_behavior_valid(behavior)) return -EINVAL; if (!PAGE_ALIGNED(start)) return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) return madvise_inject_error(behavior, start, start + len_in); #endif write = madvise_need_mmap_write(behavior); if (write) { if (mmap_write_lock_killable(mm)) return -EINTR; } else { mmap_read_lock(mm); } start = untagged_addr_remote(mm, start); end = start + len; blk_start_plug(&plug); switch (behavior) { case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: error = madvise_populate(mm, start, end, behavior); break; default: error = madvise_walk_vmas(mm, start, end, behavior, madvise_vma_behavior); break; } blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); else mmap_read_unlock(mm); return error; } SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); } /* Perform an madvise operation over a vector of addresses and lengths. */ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, int behavior) { ssize_t ret = 0; size_t total_len; total_len = iov_iter_count(iter); while (iov_iter_count(iter)) { ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), iter_iov_len(iter), behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat * the operation in aggregate, and would be surprising to the * user. * * As we have already dropped locks, it is safe to just loop and * try again. We check for fatal signals in case we need exit * early anyway. */ if (ret == -ERESTARTNOINTR) { if (fatal_signal_pending(current)) { ret = -EINTR; break; } continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } ret = (total_len - iov_iter_count(iter)) ? : ret; return ret; } SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { ssize_t ret; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; unsigned int f_flags; if (flags != 0) { ret = -EINVAL; goto out; } ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out; task = pidfd_get_task(pidfd, &f_flags); if (IS_ERR(task)) { ret = PTR_ERR(task); goto free_iov; } /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) { ret = PTR_ERR(mm); goto release_task; } /* * We need only perform this check if we are attempting to manipulate a * remote process's address space. */ if (mm != current->mm && !process_madvise_remote_valid(behavior)) { ret = -EINVAL; goto release_mm; } /* * Require CAP_SYS_NICE for influencing process performance. Note that * only non-destructive hints are currently supported for remote * processes. */ if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); release_task: put_task_struct(task); free_iov: kfree(iov); out: return ret; }
5 1 2 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/ipv4/nf_dup_ipv4.h> struct nft_dup_ipv4 { u8 sreg_addr; u8 sreg_dev; }; static void nft_dup_ipv4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); struct in_addr gw = { .s_addr = (__force __be32)regs->data[priv->sreg_addr], }; int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; nf_dup_ipv4(nft_net(pkt), pkt->skb, nft_hook(pkt), &gw, oif); } static int nft_dup_ipv4_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); int err; if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, sizeof(struct in_addr)); if (err < 0) return err; if (tb[NFTA_DUP_SREG_DEV]) err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, sizeof(int)); return err; } static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr)) goto nla_put_failure; if (priv->sreg_dev && nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_dup_ipv4_type; static const struct nft_expr_ops nft_dup_ipv4_ops = { .type = &nft_dup_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv4)), .eval = nft_dup_ipv4_eval, .init = nft_dup_ipv4_init, .dump = nft_dup_ipv4_dump, .reduce = NFT_REDUCE_READONLY, }; static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = { [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 }, [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 }, }; static struct nft_expr_type nft_dup_ipv4_type __read_mostly = { .family = NFPROTO_IPV4, .name = "dup", .ops = &nft_dup_ipv4_ops, .policy = nft_dup_ipv4_policy, .maxattr = NFTA_DUP_MAX, .owner = THIS_MODULE, }; static int __init nft_dup_ipv4_module_init(void) { return nft_register_expr(&nft_dup_ipv4_type); } static void __exit nft_dup_ipv4_module_exit(void) { nft_unregister_expr(&nft_dup_ipv4_type); } module_init(nft_dup_ipv4_module_init); module_exit(nft_dup_ipv4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup"); MODULE_DESCRIPTION("IPv4 nftables packet duplication support");
2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 34 35 1 1 30 3 24 1 7 22 1 1 1 1 1 5 1 5 5 5 4 1 4 1 4 1 5 5 1 5 5 5 1 5 1 35 35 31 31 31 1 1 31 35 35 35 1 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_flow.c Generic flow classifier * * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/pkt_cls.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> #include <linux/slab.h> #include <linux/module.h> #include <net/inet_sock.h> #include <net/pkt_cls.h> #include <net/ip.h> #include <net/route.h> #include <net/flow_dissector.h> #include <net/tc_wrapper.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif struct flow_head { struct list_head filters; struct rcu_head rcu; }; struct flow_filter { struct list_head list; struct tcf_exts exts; struct tcf_ematch_tree ematches; struct tcf_proto *tp; struct timer_list perturb_timer; u32 perturb_period; u32 handle; u32 nkeys; u32 keymask; u32 mode; u32 mask; u32 xor; u32 rshift; u32 addend; u32 divisor; u32 baseclass; u32 hashrnd; struct rcu_work rwork; }; static inline u32 addr_fold(void *addr) { unsigned long a = (unsigned long)addr; return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0); } static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { __be32 src = flow_get_u32_src(flow); if (src) return ntohl(src); return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { __be32 dst = flow_get_u32_dst(flow); if (dst) return ntohl(dst); return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) { return flow->basic.ip_proto; } static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { if (flow->ports.ports) return ntohs(flow->ports.src); return addr_fold(skb->sk); } static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { if (flow->ports.ports) return ntohs(flow->ports.dst); return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_iif(const struct sk_buff *skb) { return skb->skb_iif; } static u32 flow_get_priority(const struct sk_buff *skb) { return skb->priority; } static u32 flow_get_mark(const struct sk_buff *skb) { return skb->mark; } static u32 flow_get_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return addr_fold(skb_nfct(skb)); #else return 0; #endif } #if IS_ENABLED(CONFIG_NF_CONNTRACK) #define CTTUPLE(skb, member) \ ({ \ enum ip_conntrack_info ctinfo; \ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \ if (ct == NULL) \ goto fallback; \ ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \ }) #else #define CTTUPLE(skb, member) \ ({ \ goto fallback; \ 0; \ }) #endif static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) { switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, src.u3.ip)); case htons(ETH_P_IPV6): return ntohl(CTTUPLE(skb, src.u3.ip6[3])); } fallback: return flow_get_src(skb, flow); } static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) { switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, dst.u3.ip)); case htons(ETH_P_IPV6): return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); } fallback: return flow_get_dst(skb, flow); } static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, src.u.all)); fallback: return flow_get_proto_src(skb, flow); } static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, dst.u.all)); fallback: return flow_get_proto_dst(skb, flow); } static u32 flow_get_rtclassid(const struct sk_buff *skb) { #ifdef CONFIG_IP_ROUTE_CLASSID if (skb_dst(skb)) return skb_dst(skb)->tclassid; #endif return 0; } static u32 flow_get_skuid(const struct sk_buff *skb) { struct sock *sk = skb_to_full_sk(skb); if (sk && sk->sk_socket && sk->sk_socket->file) { kuid_t skuid = sk->sk_socket->file->f_cred->fsuid; return from_kuid(&init_user_ns, skuid); } return 0; } static u32 flow_get_skgid(const struct sk_buff *skb) { struct sock *sk = skb_to_full_sk(skb); if (sk && sk->sk_socket && sk->sk_socket->file) { kgid_t skgid = sk->sk_socket->file->f_cred->fsgid; return from_kgid(&init_user_ns, skgid); } return 0; } static u32 flow_get_vlan_tag(const struct sk_buff *skb) { u16 tag; if (vlan_get_tag(skb, &tag) < 0) return 0; return tag & VLAN_VID_MASK; } static u32 flow_get_rxhash(struct sk_buff *skb) { return skb_get_hash(skb); } static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) { switch (key) { case FLOW_KEY_SRC: return flow_get_src(skb, flow); case FLOW_KEY_DST: return flow_get_dst(skb, flow); case FLOW_KEY_PROTO: return flow_get_proto(skb, flow); case FLOW_KEY_PROTO_SRC: return flow_get_proto_src(skb, flow); case FLOW_KEY_PROTO_DST: return flow_get_proto_dst(skb, flow); case FLOW_KEY_IIF: return flow_get_iif(skb); case FLOW_KEY_PRIORITY: return flow_get_priority(skb); case FLOW_KEY_MARK: return flow_get_mark(skb); case FLOW_KEY_NFCT: return flow_get_nfct(skb); case FLOW_KEY_NFCT_SRC: return flow_get_nfct_src(skb, flow); case FLOW_KEY_NFCT_DST: return flow_get_nfct_dst(skb, flow); case FLOW_KEY_NFCT_PROTO_SRC: return flow_get_nfct_proto_src(skb, flow); case FLOW_KEY_NFCT_PROTO_DST: return flow_get_nfct_proto_dst(skb, flow); case FLOW_KEY_RTCLASSID: return flow_get_rtclassid(skb); case FLOW_KEY_SKUID: return flow_get_skuid(skb); case FLOW_KEY_SKGID: return flow_get_skgid(skb); case FLOW_KEY_VLAN_TAG: return flow_get_vlan_tag(skb); case FLOW_KEY_RXHASH: return flow_get_rxhash(skb); default: WARN_ON(1); return 0; } } #define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \ (1 << FLOW_KEY_DST) | \ (1 << FLOW_KEY_PROTO) | \ (1 << FLOW_KEY_PROTO_SRC) | \ (1 << FLOW_KEY_PROTO_DST) | \ (1 << FLOW_KEY_NFCT_SRC) | \ (1 << FLOW_KEY_NFCT_DST) | \ (1 << FLOW_KEY_NFCT_PROTO_SRC) | \ (1 << FLOW_KEY_NFCT_PROTO_DST)) TC_INDIRECT_SCOPE int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct flow_head *head = rcu_dereference_bh(tp->root); struct flow_filter *f; u32 keymask; u32 classid; unsigned int n, key; int r; list_for_each_entry_rcu(f, &head->filters, list) { u32 keys[FLOW_KEY_MAX + 1]; struct flow_keys flow_keys; if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) skb_flow_dissect_flow_keys(skb, &flow_keys, 0); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; keymask &= ~(1 << key); keys[n] = flow_key_get(skb, key, &flow_keys); } if (f->mode == FLOW_MODE_HASH) classid = jhash2(keys, f->nkeys, f->hashrnd); else { classid = keys[0]; classid = (classid & f->mask) ^ f->xor; classid = (classid >> f->rshift) + f->addend; } if (f->divisor) classid %= f->divisor; res->class = 0; res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid); r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; return r; } return -1; } static void flow_perturbation(struct timer_list *t) { struct flow_filter *f = from_timer(f, t, perturb_timer); get_random_bytes(&f->hashrnd, 4); if (f->perturb_period) mod_timer(&f->perturb_timer, jiffies + f->perturb_period); } static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_KEYS] = { .type = NLA_U32 }, [TCA_FLOW_MODE] = { .type = NLA_U32 }, [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, [TCA_FLOW_RSHIFT] = NLA_POLICY_MAX(NLA_U32, 31 /* BITS_PER_U32 - 1 */), [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, [TCA_FLOW_MASK] = { .type = NLA_U32 }, [TCA_FLOW_XOR] = { .type = NLA_U32 }, [TCA_FLOW_DIVISOR] = { .type = NLA_U32 }, [TCA_FLOW_ACT] = { .type = NLA_NESTED }, [TCA_FLOW_POLICE] = { .type = NLA_NESTED }, [TCA_FLOW_EMATCHES] = { .type = NLA_NESTED }, [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; static void __flow_destroy_filter(struct flow_filter *f) { timer_shutdown_sync(&f->perturb_timer); tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); tcf_exts_put_net(&f->exts); kfree(f); } static void flow_destroy_filter_work(struct work_struct *work) { struct flow_filter *f = container_of(to_rcu_work(work), struct flow_filter, rwork); rtnl_lock(); __flow_destroy_filter(f); rtnl_unlock(); } static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *fold, *fnew; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FLOW_MAX + 1]; unsigned int nkeys = 0; unsigned int perturb_period = 0; u32 baseclass = 0; u32 keymask = 0; u32 mode; int err; if (opt == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_FLOW_MAX, opt, flow_policy, NULL); if (err < 0) return err; if (tb[TCA_FLOW_BASECLASS]) { baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]); if (TC_H_MIN(baseclass) == 0) return -EINVAL; } if (tb[TCA_FLOW_KEYS]) { keymask = nla_get_u32(tb[TCA_FLOW_KEYS]); nkeys = hweight32(keymask); if (nkeys == 0) return -EINVAL; if (fls(keymask) - 1 > FLOW_KEY_MAX) return -EOPNOTSUPP; if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns) return -EOPNOTSUPP; } fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); if (!fnew) return -ENOBUFS; err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &fnew->ematches); if (err < 0) goto err1; err = tcf_exts_init(&fnew->exts, net, TCA_FLOW_ACT, TCA_FLOW_POLICE); if (err < 0) goto err2; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, flags, extack); if (err < 0) goto err2; fold = *arg; if (fold) { err = -EINVAL; if (fold->handle != handle && handle) goto err2; /* Copy fold into fnew */ fnew->tp = fold->tp; fnew->handle = fold->handle; fnew->nkeys = fold->nkeys; fnew->keymask = fold->keymask; fnew->mode = fold->mode; fnew->mask = fold->mask; fnew->xor = fold->xor; fnew->rshift = fold->rshift; fnew->addend = fold->addend; fnew->divisor = fold->divisor; fnew->baseclass = fold->baseclass; fnew->hashrnd = fold->hashrnd; mode = fold->mode; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) goto err2; if (mode == FLOW_MODE_HASH) perturb_period = fold->perturb_period; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) goto err2; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } } else { err = -EINVAL; if (!handle) goto err2; if (!tb[TCA_FLOW_KEYS]) goto err2; mode = FLOW_MODE_MAP; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) goto err2; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) goto err2; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } if (TC_H_MAJ(baseclass) == 0) { struct Qdisc *q = tcf_block_q(tp->chain->block); baseclass = TC_H_MAKE(q->handle, baseclass); } if (TC_H_MIN(baseclass) == 0) baseclass = TC_H_MAKE(baseclass, 1); fnew->handle = handle; fnew->mask = ~0U; fnew->tp = tp; get_random_bytes(&fnew->hashrnd, 4); } timer_setup(&fnew->perturb_timer, flow_perturbation, TIMER_DEFERRABLE); tcf_block_netif_keep_dst(tp->chain->block); if (tb[TCA_FLOW_KEYS]) { fnew->keymask = keymask; fnew->nkeys = nkeys; } fnew->mode = mode; if (tb[TCA_FLOW_MASK]) fnew->mask = nla_get_u32(tb[TCA_FLOW_MASK]); if (tb[TCA_FLOW_XOR]) fnew->xor = nla_get_u32(tb[TCA_FLOW_XOR]); if (tb[TCA_FLOW_RSHIFT]) fnew->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); if (tb[TCA_FLOW_ADDEND]) fnew->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); if (tb[TCA_FLOW_DIVISOR]) fnew->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); if (baseclass) fnew->baseclass = baseclass; fnew->perturb_period = perturb_period; if (perturb_period) mod_timer(&fnew->perturb_timer, jiffies + perturb_period); if (!*arg) list_add_tail_rcu(&fnew->list, &head->filters); else list_replace_rcu(&fold->list, &fnew->list); *arg = fnew; if (fold) { tcf_exts_get_net(&fold->exts); tcf_queue_work(&fold->rwork, flow_destroy_filter_work); } return 0; err2: tcf_exts_destroy(&fnew->exts); tcf_em_tree_destroy(&fnew->ematches); err1: kfree(fnew); return err; } static int flow_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f = arg; list_del_rcu(&f->list); tcf_exts_get_net(&f->exts); tcf_queue_work(&f->rwork, flow_destroy_filter_work); *last = list_empty(&head->filters); return 0; } static int flow_init(struct tcf_proto *tp) { struct flow_head *head; head = kzalloc(sizeof(*head), GFP_KERNEL); if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->filters); rcu_assign_pointer(tp->root, head); return 0; } static void flow_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f, *next; list_for_each_entry_safe(f, next, &head->filters, list) { list_del_rcu(&f->list); if (tcf_exts_get_net(&f->exts)) tcf_queue_work(&f->rwork, flow_destroy_filter_work); else __flow_destroy_filter(f); } kfree_rcu(head, rcu); } static void *flow_get(struct tcf_proto *tp, u32 handle) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; list_for_each_entry(f, &head->filters, list) if (f->handle == handle) return f; return NULL; } static int flow_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct flow_filter *f = fh; struct nlattr *nest; if (f == NULL) return skb->len; t->tcm_handle = f->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) || nla_put_u32(skb, TCA_FLOW_MODE, f->mode)) goto nla_put_failure; if (f->mask != ~0 || f->xor != 0) { if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) || nla_put_u32(skb, TCA_FLOW_XOR, f->xor)) goto nla_put_failure; } if (f->rshift && nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift)) goto nla_put_failure; if (f->addend && nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend)) goto nla_put_failure; if (f->divisor && nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor)) goto nla_put_failure; if (f->baseclass && nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass)) goto nla_put_failure; if (f->perturb_period && nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ)) goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; #ifdef CONFIG_NET_EMATCH if (f->ematches.hdr.nmatches && tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0) goto nla_put_failure; #endif nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; list_for_each_entry(f, &head->filters, list) { if (!tc_cls_stats_dump(tp, arg, f)) break; } } static struct tcf_proto_ops cls_flow_ops __read_mostly = { .kind = "flow", .classify = flow_classify, .init = flow_init, .destroy = flow_destroy, .change = flow_change, .delete = flow_delete, .get = flow_get, .dump = flow_dump, .walk = flow_walk, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_CLS("flow"); static int __init cls_flow_init(void) { return register_tcf_proto_ops(&cls_flow_ops); } static void __exit cls_flow_exit(void) { unregister_tcf_proto_ops(&cls_flow_ops); } module_init(cls_flow_init); module_exit(cls_flow_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("TC flow classifier");
2464 3947 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2021, Google LLC. * Pasha Tatashin <pasha.tatashin@soleen.com> */ #ifndef __LINUX_PAGE_TABLE_CHECK_H #define __LINUX_PAGE_TABLE_CHECK_H #ifdef CONFIG_PAGE_TABLE_CHECK #include <linux/jump_label.h> extern struct static_key_true page_table_check_disabled; extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); static inline void page_table_check_alloc(struct page *page, unsigned int order) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_zero(page, order); } static inline void page_table_check_free(struct page *page, unsigned int order) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_zero(page, order); } static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pte_clear(mm, pte); } static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pmd_clear(mm, pmd); } static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pud_clear(mm, pud); } static inline void page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_ptes_set(mm, ptep, pte, nr); } static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pmd_set(mm, pmdp, pmd); } static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pud_set(mm, pudp, pud); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; __page_table_check_pte_clear_range(mm, addr, pmd); } #else static inline void page_table_check_alloc(struct page *page, unsigned int order) { } static inline void page_table_check_free(struct page *page, unsigned int order) { } static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } static inline void page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { } static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { } static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd) { } #endif /* CONFIG_PAGE_TABLE_CHECK */ #endif /* __LINUX_PAGE_TABLE_CHECK_H */
16580 16589 16602 16612 16584 16594 16581 16785 16784 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor network mediation * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. */ #include "include/apparmor.h" #include "include/audit.h" #include "include/cred.h" #include "include/label.h" #include "include/net.h" #include "include/policy.h" #include "include/secid.h" #include "net_names.h" struct aa_sfs_entry aa_sfs_entry_network[] = { AA_SFS_FILE_STRING("af_mask", AA_SFS_AF_MASK), { } }; static const char * const net_mask_names[] = { "unknown", "send", "receive", "unknown", "create", "shutdown", "connect", "unknown", "setattr", "getattr", "setcred", "getcred", "chmod", "chown", "chgrp", "lock", "mmap", "mprot", "unknown", "unknown", "accept", "bind", "listen", "unknown", "setopt", "getopt", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", }; /* audit callback for net specific fields */ void audit_net_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; struct apparmor_audit_data *ad = aad(sa); if (address_family_names[sa->u.net->family]) audit_log_format(ab, " family=\"%s\"", address_family_names[sa->u.net->family]); else audit_log_format(ab, " family=\"unknown(%d)\"", sa->u.net->family); if (sock_type_names[ad->net.type]) audit_log_format(ab, " sock_type=\"%s\"", sock_type_names[ad->net.type]); else audit_log_format(ab, " sock_type=\"unknown(%d)\"", ad->net.type); audit_log_format(ab, " protocol=%d", ad->net.protocol); if (ad->request & NET_PERMS_MASK) { audit_log_format(ab, " requested_mask="); aa_audit_perm_mask(ab, ad->request, NULL, 0, net_mask_names, NET_PERMS_MASK); if (ad->denied & NET_PERMS_MASK) { audit_log_format(ab, " denied_mask="); aa_audit_perm_mask(ab, ad->denied, NULL, 0, net_mask_names, NET_PERMS_MASK); } } if (ad->peer) { audit_log_format(ab, " peer="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer, FLAGS_NONE, GFP_ATOMIC); } } /* Generic af perm */ int aa_profile_af_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request, u16 family, int type) { struct aa_ruleset *rules = list_first_entry(&profile->rules, typeof(*rules), list); struct aa_perms perms = { }; aa_state_t state; __be16 buffer[2]; AA_BUG(family >= AF_MAX); AA_BUG(type < 0 || type >= SOCK_MAX); if (profile_unconfined(profile)) return 0; state = RULE_MEDIATES(rules, AA_CLASS_NET); if (!state) return 0; buffer[0] = cpu_to_be16(family); buffer[1] = cpu_to_be16((u16) type); state = aa_dfa_match_len(rules->policy->dfa, state, (char *) &buffer, 4); perms = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, ad, audit_net_cb); } int aa_af_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, u16 family, int type, int protocol) { struct aa_profile *profile; DEFINE_AUDIT_NET(ad, op, NULL, family, type, protocol); return fn_for_each_confined(label, profile, aa_profile_af_perm(profile, &ad, request, family, type)); } static int aa_label_sk_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); int error = 0; AA_BUG(!label); AA_BUG(!sk); if (ctx->label != kernel_t && !unconfined(label)) { struct aa_profile *profile; DEFINE_AUDIT_SK(ad, op, sk); ad.subj_cred = subj_cred; error = fn_for_each_confined(label, profile, aa_profile_af_sk_perm(profile, &ad, request, sk)); } return error; } int aa_sk_perm(const char *op, u32 request, struct sock *sk) { struct aa_label *label; int error; AA_BUG(!sk); AA_BUG(in_interrupt()); /* TODO: switch to begin_current_label ???? */ label = begin_current_label_crit_section(); error = aa_label_sk_perm(current_cred(), label, op, request, sk); end_current_label_crit_section(label); return error; } int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, struct socket *sock) { AA_BUG(!label); AA_BUG(!sock); AA_BUG(!sock->sk); return aa_label_sk_perm(subj_cred, label, op, request, sock->sk); } #ifdef CONFIG_NETWORK_SECMARK static int apparmor_secmark_init(struct aa_secmark *secmark) { struct aa_label *label; if (secmark->label[0] == '*') { secmark->secid = AA_SECID_WILDCARD; return 0; } label = aa_label_strn_parse(&root_ns->unconfined->label, secmark->label, strlen(secmark->label), GFP_ATOMIC, false, false); if (IS_ERR(label)) return PTR_ERR(label); secmark->secid = label->secid; return 0; } static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid, struct apparmor_audit_data *ad) { int i, ret; struct aa_perms perms = { }; struct aa_ruleset *rules = list_first_entry(&profile->rules, typeof(*rules), list); if (rules->secmark_count == 0) return 0; for (i = 0; i < rules->secmark_count; i++) { if (!rules->secmark[i].secid) { ret = apparmor_secmark_init(&rules->secmark[i]); if (ret) return ret; } if (rules->secmark[i].secid == secid || rules->secmark[i].secid == AA_SECID_WILDCARD) { if (rules->secmark[i].deny) perms.deny = ALL_PERMS_MASK; else perms.allow = ALL_PERMS_MASK; if (rules->secmark[i].audit) perms.audit = ALL_PERMS_MASK; } } aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, ad, audit_net_cb); } int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, u32 secid, const struct sock *sk) { struct aa_profile *profile; DEFINE_AUDIT_SK(ad, op, sk); return fn_for_each_confined(label, profile, aa_secmark_perm(profile, request, secid, &ad)); } #endif
146 43 147 182 43 134 147 146 182 147 110 147 104 198 197 6 6 5 124 126 23 105 103 102 105 196 1 5 4 125 100 22 1 4 1 4 175 4 119 179 156 19 4 178 179 1 22 18 1 179 179 98 85 122 122 123 1 21 17 1 123 123 50 91 4 27 4 4 4 177 4 30 145 123 124 72 72 69 1614 1631 1616 31 7 35 36 36 36 1 35 35 121 118 7 7 7 6 12 12 12 12 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 /* netfilter.c: look after the filters for various protocols. * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. * * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any * way. * * This code is GPL. */ #include <linux/kernel.h> #include <linux/netfilter.h> #include <net/protocol.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/wait.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/if.h> #include <linux/netdevice.h> #include <linux/netfilter_ipv6.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/mutex.h> #include <linux/mm.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netfilter/nf_queue.h> #include <net/sock.h> #include "nf_internals.h" const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); DEFINE_PER_CPU(bool, nf_skb_duplicated); EXPORT_SYMBOL_GPL(nf_skb_duplicated); #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif static DEFINE_MUTEX(nf_hook_mutex); /* max hooks per family/hooknum */ #define MAX_HOOK_COUNT 1024 #define nf_entry_dereference(e) \ rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) static struct nf_hook_entries *allocate_hook_entries_size(u16 num) { struct nf_hook_entries *e; size_t alloc = sizeof(*e) + sizeof(struct nf_hook_entry) * num + sizeof(struct nf_hook_ops *) * num + sizeof(struct nf_hook_entries_rcu_head); if (num == 0) return NULL; e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT); if (e) e->num_hook_entries = num; return e; } static void __nf_hook_entries_free(struct rcu_head *h) { struct nf_hook_entries_rcu_head *head; head = container_of(h, struct nf_hook_entries_rcu_head, head); kvfree(head->allocation); } static void nf_hook_entries_free(struct nf_hook_entries *e) { struct nf_hook_entries_rcu_head *head; struct nf_hook_ops **ops; unsigned int num; if (!e) return; num = e->num_hook_entries; ops = nf_hook_entries_get_hook_ops(e); head = (void *)&ops[num]; head->allocation = e; call_rcu(&head->head, __nf_hook_entries_free); } static unsigned int accept_all(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return NF_ACCEPT; /* ACCEPT makes nf_hook_slow call next hook */ } static const struct nf_hook_ops dummy_ops = { .hook = accept_all, .priority = INT_MIN, }; static struct nf_hook_entries * nf_hook_entries_grow(const struct nf_hook_entries *old, const struct nf_hook_ops *reg) { unsigned int i, alloc_entries, nhooks, old_entries; struct nf_hook_ops **orig_ops = NULL; struct nf_hook_ops **new_ops; struct nf_hook_entries *new; bool inserted = false; alloc_entries = 1; old_entries = old ? old->num_hook_entries : 0; if (old) { orig_ops = nf_hook_entries_get_hook_ops(old); for (i = 0; i < old_entries; i++) { if (orig_ops[i] != &dummy_ops) alloc_entries++; /* Restrict BPF hook type to force a unique priority, not * shared at attach time. * * This is mainly to avoid ordering issues between two * different bpf programs, this doesn't prevent a normal * hook at same priority as a bpf one (we don't want to * prevent defrag, conntrack, iptables etc from attaching). */ if (reg->priority == orig_ops[i]->priority && reg->hook_ops_type == NF_HOOK_OP_BPF) return ERR_PTR(-EBUSY); } } if (alloc_entries > MAX_HOOK_COUNT) return ERR_PTR(-E2BIG); new = allocate_hook_entries_size(alloc_entries); if (!new) return ERR_PTR(-ENOMEM); new_ops = nf_hook_entries_get_hook_ops(new); i = 0; nhooks = 0; while (i < old_entries) { if (orig_ops[i] == &dummy_ops) { ++i; continue; } if (inserted || reg->priority > orig_ops[i]->priority) { new_ops[nhooks] = (void *)orig_ops[i]; new->hooks[nhooks] = old->hooks[i]; i++; } else { new_ops[nhooks] = (void *)reg; new->hooks[nhooks].hook = reg->hook; new->hooks[nhooks].priv = reg->priv; inserted = true; } nhooks++; } if (!inserted) { new_ops[nhooks] = (void *)reg; new->hooks[nhooks].hook = reg->hook; new->hooks[nhooks].priv = reg->priv; } return new; } static void hooks_validate(const struct nf_hook_entries *hooks) { #ifdef CONFIG_DEBUG_MISC struct nf_hook_ops **orig_ops; int prio = INT_MIN; size_t i = 0; orig_ops = nf_hook_entries_get_hook_ops(hooks); for (i = 0; i < hooks->num_hook_entries; i++) { if (orig_ops[i] == &dummy_ops) continue; WARN_ON(orig_ops[i]->priority < prio); if (orig_ops[i]->priority > prio) prio = orig_ops[i]->priority; } #endif } int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg) { struct nf_hook_entries *new_hooks; struct nf_hook_entries *p; p = rcu_dereference_raw(*pp); new_hooks = nf_hook_entries_grow(p, reg); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); BUG_ON(p == new_hooks); nf_hook_entries_free(p); return 0; } EXPORT_SYMBOL_GPL(nf_hook_entries_insert_raw); /* * __nf_hook_entries_try_shrink - try to shrink hook array * * @old -- current hook blob at @pp * @pp -- location of hook blob * * Hook unregistration must always succeed, so to-be-removed hooks * are replaced by a dummy one that will just move to next hook. * * This counts the current dummy hooks, attempts to allocate new blob, * copies the live hooks, then replaces and discards old one. * * return values: * * Returns address to free, or NULL. */ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old, struct nf_hook_entries __rcu **pp) { unsigned int i, j, skip = 0, hook_entries; struct nf_hook_entries *new = NULL; struct nf_hook_ops **orig_ops; struct nf_hook_ops **new_ops; if (WARN_ON_ONCE(!old)) return NULL; orig_ops = nf_hook_entries_get_hook_ops(old); for (i = 0; i < old->num_hook_entries; i++) { if (orig_ops[i] == &dummy_ops) skip++; } /* if skip == hook_entries all hooks have been removed */ hook_entries = old->num_hook_entries; if (skip == hook_entries) goto out_assign; if (skip == 0) return NULL; hook_entries -= skip; new = allocate_hook_entries_size(hook_entries); if (!new) return NULL; new_ops = nf_hook_entries_get_hook_ops(new); for (i = 0, j = 0; i < old->num_hook_entries; i++) { if (orig_ops[i] == &dummy_ops) continue; new->hooks[j] = old->hooks[i]; new_ops[j] = (void *)orig_ops[i]; j++; } hooks_validate(new); out_assign: rcu_assign_pointer(*pp, new); return old; } static struct nf_hook_entries __rcu ** nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, struct net_device *dev) { switch (pf) { case NFPROTO_NETDEV: break; #ifdef CONFIG_NETFILTER_FAMILY_ARP case NFPROTO_ARP: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_arp) <= hooknum)) return NULL; return net->nf.hooks_arp + hooknum; #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum)) return NULL; return net->nf.hooks_bridge + hooknum; #endif #ifdef CONFIG_NETFILTER_INGRESS case NFPROTO_INET: if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS)) return NULL; if (!dev || dev_net(dev) != net) { WARN_ON_ONCE(1); return NULL; } return &dev->nf_hooks_ingress; #endif case NFPROTO_IPV4: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum)) return NULL; return net->nf.hooks_ipv4 + hooknum; case NFPROTO_IPV6: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum)) return NULL; return net->nf.hooks_ipv6 + hooknum; default: WARN_ON_ONCE(1); return NULL; } #ifdef CONFIG_NETFILTER_INGRESS if (hooknum == NF_NETDEV_INGRESS) { if (dev && dev_net(dev) == net) return &dev->nf_hooks_ingress; } #endif #ifdef CONFIG_NETFILTER_EGRESS if (hooknum == NF_NETDEV_EGRESS) { if (dev && dev_net(dev) == net) return &dev->nf_hooks_egress; } #endif WARN_ON_ONCE(1); return NULL; } static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg, int hooknum) { #ifndef CONFIG_NETFILTER_INGRESS if (reg->hooknum == hooknum) return -EOPNOTSUPP; #endif if (reg->hooknum != hooknum || !reg->dev || dev_net(reg->dev) != net) return -EINVAL; return 0; } static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg, int pf) { if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) || (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS)) return true; return false; } static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg, int pf) { return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS; } static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL int hooknum; if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { pf = NFPROTO_NETDEV; hooknum = NF_NETDEV_INGRESS; } else { hooknum = reg->hooknum; } static_key_slow_inc(&nf_hooks_needed[pf][hooknum]); #endif } static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL int hooknum; if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { pf = NFPROTO_NETDEV; hooknum = NF_NETDEV_INGRESS; } else { hooknum = reg->hooknum; } static_key_slow_dec(&nf_hooks_needed[pf][hooknum]); #endif } static int __nf_register_net_hook(struct net *net, int pf, const struct nf_hook_ops *reg) { struct nf_hook_entries *p, *new_hooks; struct nf_hook_entries __rcu **pp; int err; switch (pf) { case NFPROTO_NETDEV: #ifndef CONFIG_NETFILTER_INGRESS if (reg->hooknum == NF_NETDEV_INGRESS) return -EOPNOTSUPP; #endif #ifndef CONFIG_NETFILTER_EGRESS if (reg->hooknum == NF_NETDEV_EGRESS) return -EOPNOTSUPP; #endif if ((reg->hooknum != NF_NETDEV_INGRESS && reg->hooknum != NF_NETDEV_EGRESS) || !reg->dev || dev_net(reg->dev) != net) return -EINVAL; break; case NFPROTO_INET: if (reg->hooknum != NF_INET_INGRESS) break; err = nf_ingress_check(net, reg, NF_INET_INGRESS); if (err < 0) return err; break; } pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); if (!pp) return -EINVAL; mutex_lock(&nf_hook_mutex); p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); if (!IS_ERR(new_hooks)) { hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); } mutex_unlock(&nf_hook_mutex); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); #endif #ifdef CONFIG_NETFILTER_EGRESS if (nf_egress_hook(reg, pf)) net_inc_egress_queue(); #endif nf_static_key_inc(reg, pf); BUG_ON(p == new_hooks); nf_hook_entries_free(p); return 0; } /* * nf_remove_net_hook - remove a hook from blob * * @oldp: current address of hook blob * @unreg: hook to unregister * * This cannot fail, hook unregistration must always succeed. * Therefore replace the to-be-removed hook with a dummy hook. */ static bool nf_remove_net_hook(struct nf_hook_entries *old, const struct nf_hook_ops *unreg) { struct nf_hook_ops **orig_ops; unsigned int i; orig_ops = nf_hook_entries_get_hook_ops(old); for (i = 0; i < old->num_hook_entries; i++) { if (orig_ops[i] != unreg) continue; WRITE_ONCE(old->hooks[i].hook, accept_all); WRITE_ONCE(orig_ops[i], (void *)&dummy_ops); return true; } return false; } static void __nf_unregister_net_hook(struct net *net, int pf, const struct nf_hook_ops *reg) { struct nf_hook_entries __rcu **pp; struct nf_hook_entries *p; pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); if (!pp) return; mutex_lock(&nf_hook_mutex); p = nf_entry_dereference(*pp); if (WARN_ON_ONCE(!p)) { mutex_unlock(&nf_hook_mutex); return; } if (nf_remove_net_hook(p, reg)) { #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_dec_ingress_queue(); #endif #ifdef CONFIG_NETFILTER_EGRESS if (nf_egress_hook(reg, pf)) net_dec_egress_queue(); #endif nf_static_key_dec(reg, pf); } else { WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum); } p = __nf_hook_entries_try_shrink(p, pp); mutex_unlock(&nf_hook_mutex); if (!p) return; nf_queue_nf_hook_drop(net); nf_hook_entries_free(p); } void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { if (reg->pf == NFPROTO_INET) { if (reg->hooknum == NF_INET_INGRESS) { __nf_unregister_net_hook(net, NFPROTO_INET, reg); } else { __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); __nf_unregister_net_hook(net, NFPROTO_IPV6, reg); } } else { __nf_unregister_net_hook(net, reg->pf, reg); } } EXPORT_SYMBOL(nf_unregister_net_hook); void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg) { struct nf_hook_entries *p; p = rcu_dereference_raw(*pp); if (nf_remove_net_hook(p, reg)) { p = __nf_hook_entries_try_shrink(p, pp); nf_hook_entries_free(p); } } EXPORT_SYMBOL_GPL(nf_hook_entries_delete_raw); int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { int err; if (reg->pf == NFPROTO_INET) { if (reg->hooknum == NF_INET_INGRESS) { err = __nf_register_net_hook(net, NFPROTO_INET, reg); if (err < 0) return err; } else { err = __nf_register_net_hook(net, NFPROTO_IPV4, reg); if (err < 0) return err; err = __nf_register_net_hook(net, NFPROTO_IPV6, reg); if (err < 0) { __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); return err; } } } else { err = __nf_register_net_hook(net, reg->pf, reg); if (err < 0) return err; } return 0; } EXPORT_SYMBOL(nf_register_net_hook); int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n) { unsigned int i; int err = 0; for (i = 0; i < n; i++) { err = nf_register_net_hook(net, &reg[i]); if (err) goto err; } return err; err: if (i > 0) nf_unregister_net_hooks(net, reg, i); return err; } EXPORT_SYMBOL(nf_register_net_hooks); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int hookcount) { unsigned int i; for (i = 0; i < hookcount; i++) nf_unregister_net_hook(net, &reg[i]); } EXPORT_SYMBOL(nf_unregister_net_hooks); /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int s) { unsigned int verdict; int ret; for (; s < e->num_hook_entries; s++) { verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: break; case NF_DROP: kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); ret = NF_DROP_GETERR(verdict); if (ret == 0) ret = -EPERM; return ret; case NF_QUEUE: ret = nf_queue(skb, state, s, verdict); if (ret == 1) continue; return ret; case NF_STOLEN: return NF_DROP_GETERR(verdict); default: WARN_ON_ONCE(1); return 0; } } return 1; } EXPORT_SYMBOL(nf_hook_slow); void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e) { struct sk_buff *skb, *next; LIST_HEAD(sublist); int ret; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); ret = nf_hook_slow(skb, state, e, 0); if (ret == 1) list_add_tail(&skb->list, &sublist); } /* Put passed packets back on main list */ list_splice(&sublist, head); } EXPORT_SYMBOL(nf_hook_slow_list); /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nfnl_ct_hook); const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_defrag_v4_hook); const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_defrag_v6_hook); #if IS_ENABLED(CONFIG_NF_CONNTRACK) u8 nf_ctnetlink_has_listener; EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener); const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_hook); /* This does not belong here, but locally generated errors need it if connection * tracking in use: without this, connection may not be in hash table, and hence * manufactured ICMP or RST packets will not be associated with it. */ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { const struct nf_ct_hook *ct_hook; if (skb->_nfct) { rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_hook->attach(new, skb); rcu_read_unlock(); } } EXPORT_SYMBOL(nf_ct_attach); void nf_conntrack_destroy(struct nf_conntrack *nfct) { const struct nf_ct_hook *ct_hook; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_hook->destroy(nfct); rcu_read_unlock(); WARN_ON(!ct_hook); } EXPORT_SYMBOL(nf_conntrack_destroy); void nf_ct_set_closing(struct nf_conntrack *nfct) { const struct nf_ct_hook *ct_hook; if (!nfct) return; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_hook->set_closing(nfct); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_ct_set_closing); bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { const struct nf_ct_hook *ct_hook; bool ret = false; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ret = ct_hook->get_tuple_skb(dst_tuple, skb); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(nf_ct_get_tuple_skb); /* Built-in default zone used e.g. by modules. */ const struct nf_conntrack_zone nf_ct_zone_dflt = { .id = NF_CT_DEFAULT_ZONE_ID, .dir = NF_CT_DEFAULT_ZONE_DIR, }; EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); #endif /* CONFIG_NF_CONNTRACK */ static void __net_init __netfilter_net_init(struct nf_hook_entries __rcu **e, int max) { int h; for (h = 0; h < max; h++) RCU_INIT_POINTER(e[h], NULL); } static int __net_init netfilter_net_init(struct net *net) { __netfilter_net_init(net->nf.hooks_ipv4, ARRAY_SIZE(net->nf.hooks_ipv4)); __netfilter_net_init(net->nf.hooks_ipv6, ARRAY_SIZE(net->nf.hooks_ipv6)); #ifdef CONFIG_NETFILTER_FAMILY_ARP __netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp)); #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); #endif #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); if (!net->nf.proc_netfilter) { if (!net_eq(net, &init_net)) pr_err("cannot create netfilter proc entry"); return -ENOMEM; } #endif return 0; } static void __net_exit netfilter_net_exit(struct net *net) { remove_proc_entry("netfilter", net->proc_net); } static struct pernet_operations netfilter_net_ops = { .init = netfilter_net_init, .exit = netfilter_net_exit, }; int __init netfilter_init(void) { int ret; ret = register_pernet_subsys(&netfilter_net_ops); if (ret < 0) goto err; #ifdef CONFIG_LWTUNNEL ret = netfilter_lwtunnel_init(); if (ret < 0) goto err_lwtunnel_pernet; #endif ret = netfilter_log_init(); if (ret < 0) goto err_log_pernet; return 0; err_log_pernet: #ifdef CONFIG_LWTUNNEL netfilter_lwtunnel_fini(); err_lwtunnel_pernet: #endif unregister_pernet_subsys(&netfilter_net_ops); err: return ret; }
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* * include/net/tipc.h: Include file for TIPC message header routines * * Copyright (c) 2017 Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_HDR_H #define _TIPC_HDR_H #include <linux/random.h> #define KEEPALIVE_MSG_MASK 0x0e080000 /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */ struct tipc_basic_hdr { __be32 w[4]; }; static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) { u32 w0 = ntohl(hdr->w[0]); bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK; __be32 key; /* Return source node identity as key */ if (likely(!keepalive_msg)) return hdr->w[3]; /* Spread PROBE/PROBE_REPLY messages across the cores */ get_random_bytes(&key, sizeof(key)); return key; } #endif
78 465 466 152 399 460 397 147 458 37 37 5 35 37 3 3 3 3 251 254 85 214 245 209 84 245 243 1 1 7 7 6 1 210 211 3 211 212 3 211 209 211 210 213 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 // SPDX-License-Identifier: GPL-2.0-only /* * jump label support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011 Peter Zijlstra * */ #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/sort.h> #include <linux/err.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> #include <linux/cpu.h> #include <asm/sections.h> /* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) { mutex_lock(&jump_label_mutex); } void jump_label_unlock(void) { mutex_unlock(&jump_label_mutex); } static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; const struct jump_entry *jeb = b; /* * Entrires are sorted by key. */ if (jump_entry_key(jea) < jump_entry_key(jeb)) return -1; if (jump_entry_key(jea) > jump_entry_key(jeb)) return 1; /* * In the batching mode, entries should also be sorted by the code * inside the already sorted list of entries, enabling a bsearch in * the vector. */ if (jump_entry_code(jea) < jump_entry_code(jeb)) return -1; if (jump_entry_code(jea) > jump_entry_code(jeb)) return 1; return 0; } static void jump_label_swap(void *a, void *b, int size) { long delta = (unsigned long)a - (unsigned long)b; struct jump_entry *jea = a; struct jump_entry *jeb = b; struct jump_entry tmp = *jea; jea->code = jeb->code - delta; jea->target = jeb->target - delta; jea->key = jeb->key - delta; jeb->code = tmp.code + delta; jeb->target = tmp.target + delta; jeb->key = tmp.key + delta; } static void jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; void *swapfn = NULL; if (IS_ENABLED(CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE)) swapfn = jump_label_swap; size = (((unsigned long)stop - (unsigned long)start) / sizeof(struct jump_entry)); sort(start, size, sizeof(struct jump_entry), jump_label_cmp, swapfn); } static void jump_label_update(struct static_key *key); /* * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h. * The use of 'atomic_read()' requires atomic.h and its problematic for some * kernel headers such as kernel.h and others. Since static_key_count() is not * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case its ok * to have it be a function here. Similarly, for 'static_key_enable()' and * 'static_key_disable()', which require bug.h. This should allow jump_label.h * to be included from most/all places for CONFIG_JUMP_LABEL. */ int static_key_count(struct static_key *key) { /* * -1 means the first static_key_slow_inc() is in progress. * static_key_enabled() must return true, so return 1 here. */ int n = atomic_read(&key->enabled); return n >= 0 ? n : 1; } EXPORT_SYMBOL_GPL(static_key_count); /* * static_key_fast_inc_not_disabled - adds a user for a static key * @key: static key that must be already enabled * * The caller must make sure that the static key can't get disabled while * in this function. It doesn't patch jump labels, only adds a user to * an already enabled static key. * * Returns true if the increment was done. Unlike refcount_t the ref counter * is not saturated, but will fail to increment on overflow. */ bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Negative key->enabled has a special meaning: it sends * static_key_slow_inc/dec() down the slow path, and it is non-zero * so it counts as "enabled" in jump_label_update(). * * The INT_MAX overflow condition is either used by the networking * code to reset or detected in the slow path of * static_key_slow_inc_cpuslocked(). */ v = atomic_read(&key->enabled); do { if (v <= 0 || v == INT_MAX) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled); bool static_key_slow_inc_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); /* * Careful if we get concurrent static_key_slow_inc/dec() calls; * later calls must wait for the first one to _finish_ the * jump_label_update() process. At the same time, however, * the jump_label_update() call below wants to see * static_key_enabled(&key) for jumps to be updated properly. */ if (static_key_fast_inc_not_disabled(key)) return true; guard(mutex)(&jump_label_mutex); /* Try to mark it as 'enabling in progress. */ if (!atomic_cmpxchg(&key->enabled, 0, -1)) { jump_label_update(key); /* * Ensure that when static_key_fast_inc_not_disabled() or * static_key_dec_not_one() observe the positive value, * they must also observe all the text changes. */ atomic_set_release(&key->enabled, 1); } else { /* * While holding the mutex this should never observe * anything else than a value >= 1 and succeed */ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) return false; } return true; } bool static_key_slow_inc(struct static_key *key) { bool ret; cpus_read_lock(); ret = static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(static_key_slow_inc); void static_key_enable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); /* * See static_key_slow_inc(). */ atomic_set_release(&key->enabled, 1); } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); void static_key_enable(struct static_key *key) { cpus_read_lock(); static_key_enable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable); void static_key_disable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } jump_label_lock(); if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); void static_key_disable(struct static_key *key) { cpus_read_lock(); static_key_disable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable); static bool static_key_dec_not_one(struct static_key *key) { int v; /* * Go into the slow path if key::enabled is less than or equal than * one. One is valid to shut down the key, anything less than one * is an imbalance, which is handled at the call site. * * That includes the special case of '-1' which is set in * static_key_slow_inc_cpuslocked(), but that's harmless as it is * fully serialized in the slow path below. By the time this task * acquires the jump label lock the value is back to one and the * retry under the lock must succeed. */ v = atomic_read(&key->enabled); do { /* * Warn about the '-1' case though; since that means a * decrement is concurrent with a first (0->1) increment. IOW * people are trying to disable something that wasn't yet fully * enabled. This suggests an ordering problem on the user side. */ WARN_ON_ONCE(v < 0); /* * Warn about underflow, and lie about success in an attempt to * not make things worse. */ if (WARN_ON_ONCE(v == 0)) return true; if (v <= 1) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1))); return true; } static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); int val; if (static_key_dec_not_one(key)) return; guard(mutex)(&jump_label_mutex); val = atomic_read(&key->enabled); /* * It should be impossible to observe -1 with jump_label_mutex held, * see static_key_slow_inc_cpuslocked(). */ if (WARN_ON_ONCE(val == -1)) return; /* * Cannot already be 0, something went sideways. */ if (WARN_ON_ONCE(val == 0)) return; if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); } static void __static_key_slow_dec(struct static_key *key) { cpus_read_lock(); __static_key_slow_dec_cpuslocked(key); cpus_read_unlock(); } void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); __static_key_slow_dec(&key->key); } EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(key); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec_cpuslocked(key); } void __static_key_slow_dec_deferred(struct static_key *key, struct delayed_work *work, unsigned long timeout) { STATIC_KEY_CHECK_USE(key); if (static_key_dec_not_one(key)) return; schedule_delayed_work(work, timeout); } EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); void __static_key_deferred_flush(void *key, struct delayed_work *work) { STATIC_KEY_CHECK_USE(key); flush_delayed_work(work); } EXPORT_SYMBOL_GPL(__static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { STATIC_KEY_CHECK_USE(key); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; } static int __jump_label_text_reserved(struct jump_entry *iter_start, struct jump_entry *iter_stop, void *start, void *end, bool init) { struct jump_entry *iter; iter = iter_start; while (iter < iter_stop) { if (init || !jump_entry_is_init(iter)) { if (addr_conflict(iter, start, end)) return 1; } iter++; } return 0; } #ifndef arch_jump_label_transform_static static void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { /* nothing to do on most architectures */ } #endif static inline struct jump_entry *static_key_entries(struct static_key *key) { WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED); return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK); } static inline bool static_key_type(struct static_key *key) { return key->type & JUMP_TYPE_TRUE; } static inline bool static_key_linked(struct static_key *key) { return key->type & JUMP_TYPE_LINKED; } static inline void static_key_clear_linked(struct static_key *key) { key->type &= ~JUMP_TYPE_LINKED; } static inline void static_key_set_linked(struct static_key *key) { key->type |= JUMP_TYPE_LINKED; } /*** * A 'struct static_key' uses a union such that it either points directly * to a table of 'struct jump_entry' or to a linked list of modules which in * turn point to 'struct jump_entry' tables. * * The two lower bits of the pointer are used to keep track of which pointer * type is in use and to store the initial branch direction, we use an access * function which preserves these bits. */ static void static_key_set_entries(struct static_key *key, struct jump_entry *entries) { unsigned long type; WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->entries = entries; key->type |= type; } static enum jump_label_type jump_label_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool enabled = static_key_enabled(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return enabled ^ branch; } static bool jump_label_can_update(struct jump_entry *entry, bool init) { /* * Cannot update code that was in an init text area. */ if (!init && jump_entry_is_init(entry)) return false; if (!kernel_text_address(jump_entry_code(entry))) { /* * This skips patching built-in __exit, which * is part of init_section_contains() but is * not part of kernel_text_address(). * * Skipping built-in __exit is fine since it * will never be executed. */ WARN_ONCE(!jump_entry_is_init(entry), "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); return false; } return true; } #ifndef HAVE_JUMP_LABEL_BATCH static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (jump_label_can_update(entry, init)) arch_jump_label_transform(entry, jump_label_type(entry)); } } #else static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (!jump_label_can_update(entry, init)) continue; if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { /* * Queue is full: Apply the current queue and try again. */ arch_jump_label_transform_apply(); BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); } } arch_jump_label_transform_apply(); } #endif void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct static_key *key = NULL; struct jump_entry *iter; /* * Since we are initializing the static_key.enabled field with * with the 'raw' int values (to avoid pulling in atomic.h) in * jump_label.h, let's make sure that is safe. There are only two * cases to check since we initialize to 0 or 1. */ BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); if (static_key_initialized) return; cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; /* rewrite NOPs */ if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); in_init = init_section_contains((void *)jump_entry_code(iter), 1); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; static_key_set_entries(key, iter); } static_key_initialized = true; jump_label_unlock(); cpus_read_unlock(); } static inline bool static_key_sealed(struct static_key *key) { return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK); } static inline void static_key_seal(struct static_key *key) { unsigned long type = key->type & JUMP_TYPE_TRUE; key->type = JUMP_TYPE_LINKED | type; } void jump_label_init_ro(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; if (WARN_ON_ONCE(!static_key_initialized)) return; cpus_read_lock(); jump_label_lock(); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk = jump_entry_key(iter); if (!is_kernel_ro_after_init((unsigned long)iterk)) continue; if (static_key_sealed(iterk)) continue; static_key_seal(iterk); } jump_label_unlock(); cpus_read_unlock(); } #ifdef CONFIG_MODULES enum jump_label_type jump_label_init_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool type = static_key_type(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return type ^ branch; } struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; static inline struct static_key_mod *static_key_mod(struct static_key *key) { WARN_ON_ONCE(!static_key_linked(key)); return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK); } /*** * key->type and key->next are the same via union. * This sets key->next and preserves the type bits. * * See additional comments above static_key_set_entries(). */ static void static_key_set_mod(struct static_key *key, struct static_key_mod *mod) { unsigned long type; WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->next = mod; key->type |= type; } static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; int ret; preempt_disable(); mod = __module_text_address((unsigned long)start); WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); if (!try_module_get(mod)) mod = NULL; preempt_enable(); if (!mod) return 0; ret = __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, start, end, mod->state == MODULE_STATE_COMING); module_put(mod); return ret; } static void __jump_label_mod_update(struct static_key *key) { struct static_key_mod *mod; for (mod = static_key_mod(key); mod; mod = mod->next) { struct jump_entry *stop; struct module *m; /* * NULL if the static_key is defined in a module * that does not use it */ if (!mod->entries) continue; m = mod->mod; if (!m) stop = __stop___jump_table; else stop = m->jump_entries + m->num_jump_entries; __jump_label_update(key, mod->entries, stop, m && m->state == MODULE_STATE_COMING); } } static int jump_label_add_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, *jlm2; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) return 0; jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; in_init = within_module_init(jump_entry_code(iter), mod); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; if (within_module((unsigned long)key, mod)) { static_key_set_entries(key, iter); continue; } /* * If the key was sealed at init, then there's no need to keep a * reference to its module entries - just patch them now and be * done with it. */ if (static_key_sealed(key)) goto do_poke; jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; if (!static_key_linked(key)) { jlm2 = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm2) { kfree(jlm); return -ENOMEM; } preempt_disable(); jlm2->mod = __module_address((unsigned long)key); preempt_enable(); jlm2->entries = static_key_entries(key); jlm2->next = NULL; static_key_set_mod(key, jlm2); static_key_set_linked(key); } jlm->mod = mod; jlm->entries = iter; jlm->next = static_key_mod(key); static_key_set_mod(key, jlm); static_key_set_linked(key); /* Only update if we've changed from our initial state */ do_poke: if (jump_label_type(iter) != jump_label_init_type(iter)) __jump_label_update(key, iter, iter_stop, true); } return 0; } static void jump_label_del_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (jump_entry_key(iter) == key) continue; key = jump_entry_key(iter); if (within_module((unsigned long)key, mod)) continue; /* No @jlm allocated because key was sealed at init. */ if (static_key_sealed(key)) continue; /* No memory during module load */ if (WARN_ON(!static_key_linked(key))) continue; prev = &key->next; jlm = static_key_mod(key); while (jlm && jlm->mod != mod) { prev = &jlm->next; jlm = jlm->next; } /* No memory during module load */ if (WARN_ON(!jlm)) continue; if (prev == &key->next) static_key_set_mod(key, jlm->next); else *prev = jlm->next; kfree(jlm); jlm = static_key_mod(key); /* if only one etry is left, fold it back into the static_key */ if (jlm->next == NULL) { static_key_set_entries(key, jlm->entries); static_key_clear_linked(key); kfree(jlm); } } } static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; int ret = 0; cpus_read_lock(); jump_label_lock(); switch (val) { case MODULE_STATE_COMING: ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } break; case MODULE_STATE_GOING: jump_label_del_module(mod); break; } jump_label_unlock(); cpus_read_unlock(); return notifier_from_errno(ret); } static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; static __init int jump_label_init_module(void) { return register_module_notifier(&jump_label_module_nb); } early_initcall(jump_label_init_module); #endif /* CONFIG_MODULES */ /*** * jump_label_text_reserved - check if addr range is reserved * @start: start text addr * @end: end text addr * * checks if the text addr located between @start and @end * overlaps with any of the jump label patch addresses. Code * that wants to modify kernel text should first verify that * it does not overlap with any of the jump label addresses. * Caller must hold jump_label_mutex. * * returns 1 if there is an overlap, 0 otherwise */ int jump_label_text_reserved(void *start, void *end) { bool init = system_state < SYSTEM_RUNNING; int ret = __jump_label_text_reserved(__start___jump_table, __stop___jump_table, start, end, init); if (ret) return ret; #ifdef CONFIG_MODULES ret = __jump_label_mod_text_reserved(start, end); #endif return ret; } static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; bool init = system_state < SYSTEM_RUNNING; struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; if (static_key_linked(key)) { __jump_label_mod_update(key); return; } preempt_disable(); mod = __module_address((unsigned long)key); if (mod) { stop = mod->jump_entries + mod->num_jump_entries; init = mod->state == MODULE_STATE_COMING; } preempt_enable(); #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) __jump_label_update(key, entry, stop, init); } #ifdef CONFIG_STATIC_KEYS_SELFTEST static DEFINE_STATIC_KEY_TRUE(sk_true); static DEFINE_STATIC_KEY_FALSE(sk_false); static __init int jump_label_test(void) { int i; for (i = 0; i < 2; i++) { WARN_ON(static_key_enabled(&sk_true.key) != true); WARN_ON(static_key_enabled(&sk_false.key) != false); WARN_ON(!static_branch_likely(&sk_true)); WARN_ON(!static_branch_unlikely(&sk_true)); WARN_ON(static_branch_likely(&sk_false)); WARN_ON(static_branch_unlikely(&sk_false)); static_branch_disable(&sk_true); static_branch_enable(&sk_false); WARN_ON(static_key_enabled(&sk_true.key) == true); WARN_ON(static_key_enabled(&sk_false.key) == false); WARN_ON(static_branch_likely(&sk_true)); WARN_ON(static_branch_unlikely(&sk_true)); WARN_ON(!static_branch_likely(&sk_false)); WARN_ON(!static_branch_unlikely(&sk_false)); static_branch_enable(&sk_true); static_branch_disable(&sk_false); } return 0; } early_initcall(jump_label_test); #endif /* STATIC_KEYS_SELFTEST */
2410 2412 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 // SPDX-License-Identifier: GPL-2.0+ /* * Universal/legacy driver for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright (C) 2001 Russell King. * * Supports: * early_serial_setup() ports * userspace-configurable "phantom" ports * serial8250_register_8250_port() ports */ #include <linux/acpi.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/console.h> #include <linux/sysrq.h> #include <linux/delay.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> #include <linux/tty.h> #include <linux/ratelimit.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/serial_8250.h> #include <linux/nmi.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/string_helpers.h> #include <linux/uaccess.h> #include <linux/io.h> #include <asm/irq.h> #include "8250.h" #define PASS_LIMIT 512 struct irq_info { struct hlist_node node; int irq; spinlock_t lock; /* Protects list not the hash */ struct list_head *head; }; #define NR_IRQ_HASH 32 /* Can be adjusted later */ static struct hlist_head irq_lists[NR_IRQ_HASH]; static DEFINE_MUTEX(hash_mutex); /* Used to walk the hash */ /* * This is the serial driver's interrupt routine. * * Arjan thinks the old way was overly complex, so it got simplified. * Alan disagrees, saying that need the complexity to handle the weird * nature of ISA shared interrupts. (This is a special exception.) * * In order to handle ISA shared interrupts properly, we need to check * that all ports have been serviced, and therefore the ISA interrupt * line has been de-asserted. * * This means we need to loop through all ports. checking that they * don't have an interrupt pending. */ static irqreturn_t serial8250_interrupt(int irq, void *dev_id) { struct irq_info *i = dev_id; struct list_head *l, *end = NULL; int pass_counter = 0, handled = 0; pr_debug("%s(%d): start\n", __func__, irq); spin_lock(&i->lock); l = i->head; do { struct uart_8250_port *up; struct uart_port *port; up = list_entry(l, struct uart_8250_port, list); port = &up->port; if (port->handle_irq(port)) { handled = 1; end = NULL; } else if (end == NULL) end = l; l = l->next; if (l == i->head && pass_counter++ > PASS_LIMIT) break; } while (l != end); spin_unlock(&i->lock); pr_debug("%s(%d): end\n", __func__, irq); return IRQ_RETVAL(handled); } /* * To support ISA shared interrupts, we need to have one interrupt * handler that ensures that the IRQ line has been deasserted * before returning. Failing to do this will result in the IRQ * line being stuck active, and, since ISA irqs are edge triggered, * no more IRQs will be seen. */ static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up) { spin_lock_irq(&i->lock); if (!list_empty(i->head)) { if (i->head == &up->list) i->head = i->head->next; list_del(&up->list); } else { BUG_ON(i->head != &up->list); i->head = NULL; } spin_unlock_irq(&i->lock); /* List empty so throw away the hash node */ if (i->head == NULL) { hlist_del(&i->node); kfree(i); } } static int serial_link_irq_chain(struct uart_8250_port *up) { struct hlist_head *h; struct irq_info *i; int ret; mutex_lock(&hash_mutex); h = &irq_lists[up->port.irq % NR_IRQ_HASH]; hlist_for_each_entry(i, h, node) if (i->irq == up->port.irq) break; if (i == NULL) { i = kzalloc(sizeof(struct irq_info), GFP_KERNEL); if (i == NULL) { mutex_unlock(&hash_mutex); return -ENOMEM; } spin_lock_init(&i->lock); i->irq = up->port.irq; hlist_add_head(&i->node, h); } mutex_unlock(&hash_mutex); spin_lock_irq(&i->lock); if (i->head) { list_add(&up->list, i->head); spin_unlock_irq(&i->lock); ret = 0; } else { INIT_LIST_HEAD(&up->list); i->head = &up->list; spin_unlock_irq(&i->lock); ret = request_irq(up->port.irq, serial8250_interrupt, up->port.irqflags, up->port.name, i); if (ret < 0) serial_do_unlink(i, up); } return ret; } static void serial_unlink_irq_chain(struct uart_8250_port *up) { struct irq_info *i; struct hlist_head *h; mutex_lock(&hash_mutex); h = &irq_lists[up->port.irq % NR_IRQ_HASH]; hlist_for_each_entry(i, h, node) if (i->irq == up->port.irq) break; BUG_ON(i == NULL); BUG_ON(i->head == NULL); if (list_empty(i->head)) free_irq(up->port.irq, i); serial_do_unlink(i, up); mutex_unlock(&hash_mutex); } /* * This function is used to handle ports that do not have an * interrupt. This doesn't work very well for 16450's, but gives * barely passable results for a 16550A. (Although at the expense * of much CPU overhead). */ static void serial8250_timeout(struct timer_list *t) { struct uart_8250_port *up = from_timer(up, t, timer); up->port.handle_irq(&up->port); mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port)); } static void serial8250_backup_timeout(struct timer_list *t) { struct uart_8250_port *up = from_timer(up, t, timer); unsigned int iir, ier = 0, lsr; unsigned long flags; uart_port_lock_irqsave(&up->port, &flags); /* * Must disable interrupts or else we risk racing with the interrupt * based handler. */ if (up->port.irq) { ier = serial_in(up, UART_IER); serial_out(up, UART_IER, 0); } iir = serial_in(up, UART_IIR); /* * This should be a safe test for anyone who doesn't trust the * IIR bits on their UART, but it's specifically designed for * the "Diva" UART used on the management processor on many HP * ia64 and parisc boxes. */ lsr = serial_lsr_in(up); if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) && (!kfifo_is_empty(&up->port.state->port.xmit_fifo) || up->port.x_char) && (lsr & UART_LSR_THRE)) { iir &= ~(UART_IIR_ID | UART_IIR_NO_INT); iir |= UART_IIR_THRI; } if (!(iir & UART_IIR_NO_INT)) serial8250_tx_chars(up); if (up->port.irq) serial_out(up, UART_IER, ier); uart_port_unlock_irqrestore(&up->port, flags); /* Standard timer interval plus 0.2s to keep the port running */ mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port) + HZ / 5); } static void univ8250_setup_timer(struct uart_8250_port *up) { struct uart_port *port = &up->port; /* * The above check will only give an accurate result the first time * the port is opened so this value needs to be preserved. */ if (up->bugs & UART_BUG_THRE) { pr_debug("%s - using backup timer\n", port->name); up->timer.function = serial8250_backup_timeout; mod_timer(&up->timer, jiffies + uart_poll_timeout(port) + HZ / 5); } /* * If the "interrupt" for this port doesn't correspond with any * hardware interrupt, we use a timer-based system. The original * driver used to do this with IRQ0. */ if (!port->irq) mod_timer(&up->timer, jiffies + uart_poll_timeout(port)); } static int univ8250_setup_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; if (port->irq) return serial_link_irq_chain(up); return 0; } static void univ8250_release_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; del_timer_sync(&up->timer); up->timer.function = serial8250_timeout; if (port->irq) serial_unlink_irq_chain(up); } const struct uart_ops *univ8250_port_base_ops = NULL; struct uart_ops univ8250_port_ops; static const struct uart_8250_ops univ8250_driver_ops = { .setup_irq = univ8250_setup_irq, .release_irq = univ8250_release_irq, .setup_timer = univ8250_setup_timer, }; static struct uart_8250_port serial8250_ports[UART_NR]; /** * serial8250_get_port - retrieve struct uart_8250_port * @line: serial line number * * This function retrieves struct uart_8250_port for the specific line. * This struct *must* *not* be used to perform a 8250 or serial core operation * which is not accessible otherwise. Its only purpose is to make the struct * accessible to the runtime-pm callbacks for context suspend/restore. * The lock assumption made here is none because runtime-pm suspend/resume * callbacks should not be invoked if there is any operation performed on the * port. */ struct uart_8250_port *serial8250_get_port(int line) { return &serial8250_ports[line]; } EXPORT_SYMBOL_GPL(serial8250_get_port); static inline void serial8250_apply_quirks(struct uart_8250_port *up) { up->port.quirks |= skip_txen_test ? UPQ_NO_TXEN_TEST : 0; } struct uart_8250_port *serial8250_setup_port(int index) { struct uart_8250_port *up; if (index >= UART_NR) return NULL; up = &serial8250_ports[index]; up->port.line = index; up->port.port_id = index; serial8250_init_port(up); if (!univ8250_port_base_ops) univ8250_port_base_ops = up->port.ops; up->port.ops = &univ8250_port_ops; timer_setup(&up->timer, serial8250_timeout, 0); up->ops = &univ8250_driver_ops; serial8250_set_defaults(up); return up; } void __init serial8250_register_ports(struct uart_driver *drv, struct device *dev) { int i; for (i = 0; i < nr_uarts; i++) { struct uart_8250_port *up = &serial8250_ports[i]; if (up->port.type == PORT_8250_CIR) continue; if (up->port.dev) continue; up->port.dev = dev; if (uart_console_registered(&up->port)) pm_runtime_get_sync(up->port.dev); serial8250_apply_quirks(up); uart_add_one_port(drv, &up->port); } } #ifdef CONFIG_SERIAL_8250_CONSOLE static void univ8250_console_write(struct console *co, const char *s, unsigned int count) { struct uart_8250_port *up = &serial8250_ports[co->index]; serial8250_console_write(up, s, count); } static int univ8250_console_setup(struct console *co, char *options) { struct uart_8250_port *up; struct uart_port *port; int retval, i; /* * Check whether an invalid uart number has been specified, and * if so, search for the first available port that does have * console support. */ if (co->index < 0 || co->index >= UART_NR) co->index = 0; /* * If the console is past the initial isa ports, init more ports up to * co->index as needed and increment nr_uarts accordingly. */ for (i = nr_uarts; i <= co->index; i++) { up = serial8250_setup_port(i); if (!up) return -ENODEV; nr_uarts++; } port = &serial8250_ports[co->index].port; /* link port to console */ uart_port_set_cons(port, co); retval = serial8250_console_setup(port, options, false); if (retval != 0) uart_port_set_cons(port, NULL); return retval; } static int univ8250_console_exit(struct console *co) { struct uart_port *port; port = &serial8250_ports[co->index].port; return serial8250_console_exit(port); } /** * univ8250_console_match - non-standard console matching * @co: registering console * @name: name from console command line * @idx: index from console command line * @options: ptr to option string from console command line * * Only attempts to match console command lines of the form: * console=uart[8250],io|mmio|mmio16|mmio32,<addr>[,<options>] * console=uart[8250],0x<addr>[,<options>] * This form is used to register an initial earlycon boot console and * replace it with the serial8250_console at 8250 driver init. * * Performs console setup for a match (as required by interface) * If no <options> are specified, then assume the h/w is already setup. * * Returns 0 if console matches; otherwise non-zero to use default matching */ static int univ8250_console_match(struct console *co, char *name, int idx, char *options) { char match[] = "uart"; /* 8250-specific earlycon name */ unsigned char iotype; resource_size_t addr; int i; if (strncmp(name, match, 4) != 0) return -ENODEV; if (uart_parse_earlycon(options, &iotype, &addr, &options)) return -ENODEV; /* try to match the port specified on the command line */ for (i = 0; i < nr_uarts; i++) { struct uart_port *port = &serial8250_ports[i].port; if (port->iotype != iotype) continue; if ((iotype == UPIO_MEM || iotype == UPIO_MEM16 || iotype == UPIO_MEM32 || iotype == UPIO_MEM32BE) && (port->mapbase != addr)) continue; if (iotype == UPIO_PORT && port->iobase != addr) continue; co->index = i; uart_port_set_cons(port, co); return serial8250_console_setup(port, options, true); } return -ENODEV; } static struct console univ8250_console = { .name = "ttyS", .write = univ8250_console_write, .device = uart_console_device, .setup = univ8250_console_setup, .exit = univ8250_console_exit, .match = univ8250_console_match, .flags = CON_PRINTBUFFER | CON_ANYTIME, .index = -1, .data = &serial8250_reg, }; static int __init univ8250_console_init(void) { if (nr_uarts == 0) return -ENODEV; serial8250_isa_init_ports(); register_console(&univ8250_console); return 0; } console_initcall(univ8250_console_init); #define SERIAL8250_CONSOLE (&univ8250_console) #else #define SERIAL8250_CONSOLE NULL #endif struct uart_driver serial8250_reg = { .owner = THIS_MODULE, .driver_name = "serial", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, .cons = SERIAL8250_CONSOLE, }; /* * early_serial_setup - early registration for 8250 ports * * Setup an 8250 port structure prior to console initialisation. Use * after console initialisation will cause undefined behaviour. */ int __init early_serial_setup(struct uart_port *port) { struct uart_port *p; if (port->line >= ARRAY_SIZE(serial8250_ports) || nr_uarts == 0) return -ENODEV; serial8250_isa_init_ports(); p = &serial8250_ports[port->line].port; p->iobase = port->iobase; p->membase = port->membase; p->irq = port->irq; p->irqflags = port->irqflags; p->uartclk = port->uartclk; p->fifosize = port->fifosize; p->regshift = port->regshift; p->iotype = port->iotype; p->flags = port->flags; p->mapbase = port->mapbase; p->mapsize = port->mapsize; p->private_data = port->private_data; p->type = port->type; p->line = port->line; serial8250_set_defaults(up_to_u8250p(p)); if (port->serial_in) p->serial_in = port->serial_in; if (port->serial_out) p->serial_out = port->serial_out; if (port->handle_irq) p->handle_irq = port->handle_irq; return 0; } /** * serial8250_suspend_port - suspend one serial port * @line: serial line number * * Suspend one serial port. */ void serial8250_suspend_port(int line) { struct uart_8250_port *up = &serial8250_ports[line]; struct uart_port *port = &up->port; if (!console_suspend_enabled && uart_console(port) && port->type != PORT_8250) { unsigned char canary = 0xa5; serial_out(up, UART_SCR, canary); if (serial_in(up, UART_SCR) == canary) up->canary = canary; } uart_suspend_port(&serial8250_reg, port); } EXPORT_SYMBOL(serial8250_suspend_port); /** * serial8250_resume_port - resume one serial port * @line: serial line number * * Resume one serial port. */ void serial8250_resume_port(int line) { struct uart_8250_port *up = &serial8250_ports[line]; struct uart_port *port = &up->port; up->canary = 0; if (up->capabilities & UART_NATSEMI) { /* Ensure it's still in high speed mode */ serial_port_out(port, UART_LCR, 0xE0); ns16550a_goto_highspeed(up); serial_port_out(port, UART_LCR, 0); port->uartclk = 921600*16; } uart_resume_port(&serial8250_reg, port); } EXPORT_SYMBOL(serial8250_resume_port); /* * serial8250_register_8250_port and serial8250_unregister_port allows for * 16x50 serial ports to be configured at run-time, to support PCMCIA * modems and PCI multiport cards. */ static DEFINE_MUTEX(serial_mutex); static struct uart_8250_port *serial8250_find_match_or_unused(const struct uart_port *port) { int i; /* * First, find a port entry which matches. */ for (i = 0; i < nr_uarts; i++) if (uart_match_port(&serial8250_ports[i].port, port)) return &serial8250_ports[i]; /* try line number first if still available */ i = port->line; if (i < nr_uarts && serial8250_ports[i].port.type == PORT_UNKNOWN && serial8250_ports[i].port.iobase == 0) return &serial8250_ports[i]; /* * We didn't find a matching entry, so look for the first * free entry. We look for one which hasn't been previously * used (indicated by zero iobase). */ for (i = 0; i < nr_uarts; i++) if (serial8250_ports[i].port.type == PORT_UNKNOWN && serial8250_ports[i].port.iobase == 0) return &serial8250_ports[i]; /* * That also failed. Last resort is to find any entry which * doesn't have a real port associated with it. */ for (i = 0; i < nr_uarts; i++) if (serial8250_ports[i].port.type == PORT_UNKNOWN) return &serial8250_ports[i]; return NULL; } static void serial_8250_overrun_backoff_work(struct work_struct *work) { struct uart_8250_port *up = container_of(to_delayed_work(work), struct uart_8250_port, overrun_backoff); struct uart_port *port = &up->port; unsigned long flags; uart_port_lock_irqsave(port, &flags); up->ier |= UART_IER_RLSI | UART_IER_RDI; serial_out(up, UART_IER, up->ier); uart_port_unlock_irqrestore(port, flags); } /** * serial8250_register_8250_port - register a serial port * @up: serial port template * * Configure the serial port specified by the request. If the * port exists and is in use, it is hung up and unregistered * first. * * The port is then probed and if necessary the IRQ is autodetected * If this fails an error is returned. * * On success the port is ready to use and the line number is returned. */ int serial8250_register_8250_port(const struct uart_8250_port *up) { struct uart_8250_port *uart; int ret = -ENOSPC; if (up->port.uartclk == 0) return -EINVAL; mutex_lock(&serial_mutex); uart = serial8250_find_match_or_unused(&up->port); if (!uart) { /* * If the port is past the initial isa ports, initialize a new * port and increment nr_uarts accordingly. */ uart = serial8250_setup_port(nr_uarts); if (!uart) goto unlock; nr_uarts++; } if (uart->port.type != PORT_8250_CIR) { struct mctrl_gpios *gpios; if (uart->port.dev) uart_remove_one_port(&serial8250_reg, &uart->port); uart->port.ctrl_id = up->port.ctrl_id; uart->port.port_id = up->port.port_id; uart->port.iobase = up->port.iobase; uart->port.membase = up->port.membase; uart->port.irq = up->port.irq; uart->port.irqflags = up->port.irqflags; uart->port.uartclk = up->port.uartclk; uart->port.fifosize = up->port.fifosize; uart->port.regshift = up->port.regshift; uart->port.iotype = up->port.iotype; uart->port.flags = up->port.flags | UPF_BOOT_AUTOCONF; uart->bugs = up->bugs; uart->port.mapbase = up->port.mapbase; uart->port.mapsize = up->port.mapsize; uart->port.private_data = up->port.private_data; uart->tx_loadsz = up->tx_loadsz; uart->capabilities = up->capabilities; uart->port.throttle = up->port.throttle; uart->port.unthrottle = up->port.unthrottle; uart->port.rs485_config = up->port.rs485_config; uart->port.rs485_supported = up->port.rs485_supported; uart->port.rs485 = up->port.rs485; uart->rs485_start_tx = up->rs485_start_tx; uart->rs485_stop_tx = up->rs485_stop_tx; uart->lsr_save_mask = up->lsr_save_mask; uart->dma = up->dma; /* Take tx_loadsz from fifosize if it wasn't set separately */ if (uart->port.fifosize && !uart->tx_loadsz) uart->tx_loadsz = uart->port.fifosize; if (up->port.dev) { uart->port.dev = up->port.dev; ret = uart_get_rs485_mode(&uart->port); if (ret) goto err; } if (up->port.flags & UPF_FIXED_TYPE) uart->port.type = up->port.type; /* * Only call mctrl_gpio_init(), if the device has no ACPI * companion device */ if (!has_acpi_companion(uart->port.dev)) { gpios = mctrl_gpio_init(&uart->port, 0); if (IS_ERR(gpios)) { ret = PTR_ERR(gpios); goto err; } else { uart->gpios = gpios; } } serial8250_set_defaults(uart); /* Possibly override default I/O functions. */ if (up->port.serial_in) uart->port.serial_in = up->port.serial_in; if (up->port.serial_out) uart->port.serial_out = up->port.serial_out; if (up->port.handle_irq) uart->port.handle_irq = up->port.handle_irq; /* Possibly override set_termios call */ if (up->port.set_termios) uart->port.set_termios = up->port.set_termios; if (up->port.set_ldisc) uart->port.set_ldisc = up->port.set_ldisc; if (up->port.get_mctrl) uart->port.get_mctrl = up->port.get_mctrl; if (up->port.set_mctrl) uart->port.set_mctrl = up->port.set_mctrl; if (up->port.get_divisor) uart->port.get_divisor = up->port.get_divisor; if (up->port.set_divisor) uart->port.set_divisor = up->port.set_divisor; if (up->port.startup) uart->port.startup = up->port.startup; if (up->port.shutdown) uart->port.shutdown = up->port.shutdown; if (up->port.pm) uart->port.pm = up->port.pm; if (up->port.handle_break) uart->port.handle_break = up->port.handle_break; if (up->dl_read) uart->dl_read = up->dl_read; if (up->dl_write) uart->dl_write = up->dl_write; if (uart->port.type != PORT_8250_CIR) { if (uart_console_registered(&uart->port)) pm_runtime_get_sync(uart->port.dev); if (serial8250_isa_config != NULL) serial8250_isa_config(0, &uart->port, &uart->capabilities); serial8250_apply_quirks(uart); ret = uart_add_one_port(&serial8250_reg, &uart->port); if (ret) goto err; ret = uart->port.line; } else { dev_info(uart->port.dev, "skipping CIR port at 0x%lx / 0x%llx, IRQ %d\n", uart->port.iobase, (unsigned long long)uart->port.mapbase, uart->port.irq); ret = 0; } if (!uart->lsr_save_mask) uart->lsr_save_mask = LSR_SAVE_FLAGS; /* Use default LSR mask */ /* Initialise interrupt backoff work if required */ if (up->overrun_backoff_time_ms > 0) { uart->overrun_backoff_time_ms = up->overrun_backoff_time_ms; INIT_DELAYED_WORK(&uart->overrun_backoff, serial_8250_overrun_backoff_work); } else { uart->overrun_backoff_time_ms = 0; } } unlock: mutex_unlock(&serial_mutex); return ret; err: uart->port.dev = NULL; mutex_unlock(&serial_mutex); return ret; } EXPORT_SYMBOL(serial8250_register_8250_port); /** * serial8250_unregister_port - remove a 16x50 serial port at runtime * @line: serial line number * * Remove one serial port. This may not be called from interrupt * context. We hand the port back to the our control. */ void serial8250_unregister_port(int line) { struct uart_8250_port *uart = &serial8250_ports[line]; mutex_lock(&serial_mutex); if (uart->em485) { unsigned long flags; uart_port_lock_irqsave(&uart->port, &flags); serial8250_em485_destroy(uart); uart_port_unlock_irqrestore(&uart->port, flags); } uart_remove_one_port(&serial8250_reg, &uart->port); if (serial8250_isa_devs) { uart->port.flags &= ~UPF_BOOT_AUTOCONF; uart->port.type = PORT_UNKNOWN; uart->port.dev = &serial8250_isa_devs->dev; uart->port.port_id = line; uart->capabilities = 0; serial8250_init_port(uart); serial8250_apply_quirks(uart); uart_add_one_port(&serial8250_reg, &uart->port); } else { uart->port.dev = NULL; } mutex_unlock(&serial_mutex); } EXPORT_SYMBOL(serial8250_unregister_port); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Generic 8250/16x50 serial driver");
7 7 8 8 2 7 7 7 8 8 4 8 31 30 8 8 1 6 4 1 3 1 5 1 6 2 4 1 2 4 1 1 1 4 2 6 2 4 6 13 13 13 1 12 7 7 7 7 12 3 10 12 2 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/siphash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/red.h> /* Stochastic Fairness Queuing algorithm. ======================================= Source: Paul E. McKenney "Stochastic Fairness Queuing", IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. Paul E. McKenney "Stochastic Fairness Queuing", "Interworking: Research and Experience", v.2, 1991, p.113-131. See also: M. Shreedhar and George Varghese "Efficient Fair Queuing using Deficit Round Robin", Proc. SIGCOMM 95. This is not the thing that is usually called (W)FQ nowadays. It does not use any timestamp mechanism, but instead processes queues in round-robin order. ADVANTAGE: - It is very cheap. Both CPU and memory requirements are minimal. DRAWBACKS: - "Stochastic" -> It is not 100% fair. When hash collisions occur, several flows are considered as one. - "Round-robin" -> It introduces larger delays than virtual clock based schemes, and should not be used for isolating interactive traffic from non-interactive. It means, that this scheduler should be used as leaf of CBQ or P3, which put interactive traffic to higher priority band. We still need true WFQ for top level CSZ, but using WFQ for the best effort traffic is absolutely pointless: SFQ is superior for this purpose. IMPLEMENTATION: This implementation limits : - maximal queue length per flow to 127 packets. - max mtu to 2^18-1; - max 65408 flows, - number of hash buckets to 65536. It is easy to increase these values, but not in flight. */ #define SFQ_MAX_DEPTH 127 /* max number of packets per flow */ #define SFQ_DEFAULT_FLOWS 128 #define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */ #define SFQ_EMPTY_SLOT 0xffff #define SFQ_DEFAULT_HASH_DIVISOR 1024 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ typedef u16 sfq_index; /* * We dont use pointers to save space. * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH] * are 'pointers' to dep[] array */ struct sfq_head { sfq_index next; sfq_index prev; }; struct sfq_slot { struct sk_buff *skblist_next; struct sk_buff *skblist_prev; sfq_index qlen; /* number of skbs in skblist */ sfq_index next; /* next slot in sfq RR chain */ struct sfq_head dep; /* anchor in dep[] chains */ unsigned short hash; /* hash value (index in ht[]) */ int allot; /* credit for this slot */ unsigned int backlog; struct red_vars vars; }; struct sfq_sched_data { /* frequently used fields */ int limit; /* limit of total number of packets in this qdisc */ unsigned int divisor; /* number of slots in hash table */ u8 headdrop; u8 maxdepth; /* limit of packets per flow */ siphash_key_t perturbation; u8 cur_depth; /* depth of longest slot */ u8 flags; struct tcf_proto __rcu *filter_list; struct tcf_block *block; sfq_index *ht; /* Hash table ('divisor' slots) */ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ struct red_parms *red_parms; struct tc_sfqred_stats stats; struct sfq_slot *tail; /* current slot in round */ struct sfq_head dep[SFQ_MAX_DEPTH + 1]; /* Linked lists of slots, indexed by depth * dep[0] : list of unused flows * dep[1] : list of flows with 1 packet * dep[X] : list of flows with X packets */ unsigned int maxflows; /* number of flows in flows array */ int perturb_period; unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ struct timer_list perturb_timer; struct Qdisc *sch; }; /* * sfq_head are either in a sfq_slot or in dep[] array */ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val) { if (val < SFQ_MAX_FLOWS) return &q->slots[val].dep; return &q->dep[val - SFQ_MAX_FLOWS]; } static unsigned int sfq_hash(const struct sfq_sched_data *q, const struct sk_buff *skb) { return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct sfq_sched_data *q = qdisc_priv(sch); struct tcf_result res; struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && TC_H_MIN(skb->priority) <= q->divisor) return TC_H_MIN(skb->priority); fl = rcu_dereference_bh(q->filter_list); if (!fl) return sfq_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return 0; } #endif if (TC_H_MIN(res.classid) <= q->divisor) return TC_H_MIN(res.classid); } return 0; } /* * x : slot number [0 .. SFQ_MAX_FLOWS - 1] */ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; struct sfq_slot *slot = &q->slots[x]; int qlen = slot->qlen; p = qlen + SFQ_MAX_FLOWS; n = q->dep[qlen].next; slot->dep.next = n; slot->dep.prev = p; q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */ sfq_dep_head(q, n)->prev = x; } #define sfq_unlink(q, x, n, p) \ do { \ n = q->slots[x].dep.next; \ p = q->slots[x].dep.prev; \ sfq_dep_head(q, p)->next = n; \ sfq_dep_head(q, n)->prev = p; \ } while (0) static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; int d; sfq_unlink(q, x, n, p); d = q->slots[x].qlen--; if (n == p && q->cur_depth == d) q->cur_depth--; sfq_link(q, x); } static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; int d; sfq_unlink(q, x, n, p); d = ++q->slots[x].qlen; if (q->cur_depth < d) q->cur_depth = d; sfq_link(q, x); } /* helper functions : might be changed when/if skb use a standard list_head */ /* remove one skb from tail of slot queue */ static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot) { struct sk_buff *skb = slot->skblist_prev; slot->skblist_prev = skb->prev; skb->prev->next = (struct sk_buff *)slot; skb->next = skb->prev = NULL; return skb; } /* remove one skb from head of slot queue */ static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) { struct sk_buff *skb = slot->skblist_next; slot->skblist_next = skb->next; skb->next->prev = (struct sk_buff *)slot; skb->next = skb->prev = NULL; return skb; } static inline void slot_queue_init(struct sfq_slot *slot) { memset(slot, 0, sizeof(*slot)); slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; } /* add skb to slot queue (tail add) */ static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) { skb->prev = slot->skblist_prev; skb->next = (struct sk_buff *)slot; slot->skblist_prev->next = skb; slot->skblist_prev = skb; } static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct sfq_sched_data *q = qdisc_priv(sch); sfq_index x, d = q->cur_depth; struct sk_buff *skb; unsigned int len; struct sfq_slot *slot; /* Queue is full! Find the longest slot and drop tail packet from it */ if (d > 1) { x = q->dep[d].next; slot = &q->slots[x]; drop: skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); len = qdisc_pkt_len(skb); slot->backlog -= len; sfq_dec(q, x); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, to_free); return len; } if (d == 1) { /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ x = q->tail->next; slot = &q->slots[x]; q->tail->next = slot->next; q->ht[slot->hash] = SFQ_EMPTY_SLOT; goto drop; } return 0; } /* Is ECN parameter configured */ static int sfq_prob_mark(const struct sfq_sched_data *q) { return q->flags & TC_RED_ECN; } /* Should packets over max threshold just be marked */ static int sfq_hard_mark(const struct sfq_sched_data *q) { return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; } static int sfq_headdrop(const struct sfq_sched_data *q) { return q->headdrop; } static int sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned int hash, dropped; sfq_index x, qlen; struct sfq_slot *slot; int ret; struct sk_buff *head; int delta; hash = sfq_classify(skb, sch, &ret); if (hash == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } hash--; x = q->ht[hash]; slot = &q->slots[x]; if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) return qdisc_drop(skb, sch, to_free); q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; slot->backlog = 0; /* should already be 0 anyway... */ red_set_vars(&slot->vars); goto enqueue; } if (q->red_parms) { slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms, &slot->vars, slot->backlog); switch (red_action(q->red_parms, &slot->vars, slot->vars.qavg)) { case RED_DONT_MARK: break; case RED_PROB_MARK: qdisc_qstats_overlimit(sch); if (sfq_prob_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && INET_ECN_set_ce(slot->skblist_next)) { q->stats.prob_mark_head++; break; } if (INET_ECN_set_ce(skb)) { q->stats.prob_mark++; break; } } q->stats.prob_drop++; goto congestion_drop; case RED_HARD_MARK: qdisc_qstats_overlimit(sch); if (sfq_hard_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && INET_ECN_set_ce(slot->skblist_next)) { q->stats.forced_mark_head++; break; } if (INET_ECN_set_ce(skb)) { q->stats.forced_mark++; break; } } q->stats.forced_drop++; goto congestion_drop; } } if (slot->qlen >= q->maxdepth) { congestion_drop: if (!sfq_headdrop(q)) return qdisc_drop(skb, sch, to_free); /* We know we have at least one packet in queue */ head = slot_dequeue_head(slot); delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); sch->qstats.backlog -= delta; slot->backlog -= delta; qdisc_drop(head, sch, to_free); slot_queue_add(slot, skb); qdisc_tree_reduce_backlog(sch, 0, delta); return NET_XMIT_CN; } enqueue: qdisc_qstats_backlog_inc(sch, skb); slot->backlog += qdisc_pkt_len(skb); slot_queue_add(slot, skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ slot->next = x; } else { slot->next = q->tail->next; q->tail->next = x; } /* We put this flow at the end of our flow list. * This might sound unfair for a new flow to wait after old ones, * but we could endup servicing new flows only, and freeze old ones. */ q->tail = slot; /* We could use a bigger initial quantum for new flows */ slot->allot = q->quantum; } if (++sch->q.qlen <= q->limit) return NET_XMIT_SUCCESS; qlen = slot->qlen; dropped = sfq_drop(sch, to_free); /* Return Congestion Notification only if we dropped a packet * from this flow. */ if (qlen != slot->qlen) { qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb)); return NET_XMIT_CN; } /* As we dropped a packet, better let upper stack know this */ qdisc_tree_reduce_backlog(sch, 1, dropped); return NET_XMIT_SUCCESS; } static struct sk_buff * sfq_dequeue(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; sfq_index a, next_a; struct sfq_slot *slot; /* No active slots */ if (q->tail == NULL) return NULL; next_slot: a = q->tail->next; slot = &q->slots[a]; if (slot->allot <= 0) { q->tail = slot; slot->allot += q->quantum; goto next_slot; } skb = slot_dequeue_head(slot); sfq_dec(q, a); qdisc_bstats_update(sch, skb); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); slot->backlog -= qdisc_pkt_len(skb); /* Is the slot empty? */ if (slot->qlen == 0) { q->ht[slot->hash] = SFQ_EMPTY_SLOT; next_a = slot->next; if (a == next_a) { q->tail = NULL; /* no more active slots */ return skb; } q->tail->next = next_a; } else { slot->allot -= qdisc_pkt_len(skb); } return skb; } static void sfq_reset(struct Qdisc *sch) { struct sk_buff *skb; while ((skb = sfq_dequeue(sch)) != NULL) rtnl_kfree_skbs(skb, skb); } /* * When q->perturbation is changed, we rehash all queued skbs * to avoid OOO (Out Of Order) effects. * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change * counters. */ static void sfq_rehash(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; int i; struct sfq_slot *slot; struct sk_buff_head list; int dropped = 0; unsigned int drop_len = 0; __skb_queue_head_init(&list); for (i = 0; i < q->maxflows; i++) { slot = &q->slots[i]; if (!slot->qlen) continue; while (slot->qlen) { skb = slot_dequeue_head(slot); sfq_dec(q, i); __skb_queue_tail(&list, skb); } slot->backlog = 0; red_set_vars(&slot->vars); q->ht[slot->hash] = SFQ_EMPTY_SLOT; } q->tail = NULL; while ((skb = __skb_dequeue(&list)) != NULL) { unsigned int hash = sfq_hash(q, skb); sfq_index x = q->ht[hash]; slot = &q->slots[x]; if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) { drop: qdisc_qstats_backlog_dec(sch, skb); drop_len += qdisc_pkt_len(skb); kfree_skb(skb); dropped++; continue; } q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; } if (slot->qlen >= q->maxdepth) goto drop; slot_queue_add(slot, skb); if (q->red_parms) slot->vars.qavg = red_calc_qavg(q->red_parms, &slot->vars, slot->backlog); slot->backlog += qdisc_pkt_len(skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ slot->next = x; } else { slot->next = q->tail->next; q->tail->next = x; } q->tail = slot; slot->allot = q->quantum; } } sch->q.qlen -= dropped; qdisc_tree_reduce_backlog(sch, dropped, drop_len); } static void sfq_perturbation(struct timer_list *t) { struct sfq_sched_data *q = from_timer(q, t, perturb_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; siphash_key_t nkey; int period; get_random_bytes(&nkey, sizeof(nkey)); rcu_read_lock(); root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); q->perturbation = nkey; if (!q->filter_list && q->tail) sfq_rehash(sch); spin_unlock(root_lock); /* q->perturb_period can change under us from * sfq_change() and sfq_destroy(). */ period = READ_ONCE(q->perturb_period); if (period) mod_timer(&q->perturb_timer, jiffies + period); rcu_read_unlock(); } static int sfq_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); struct tc_sfq_qopt *ctl = nla_data(opt); struct tc_sfq_qopt_v1 *ctl_v1 = NULL; unsigned int qlen, dropped = 0; struct red_parms *p = NULL; struct sk_buff *to_free = NULL; struct sk_buff *tail = NULL; if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1))) ctl_v1 = nla_data(opt); if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; if ((int)ctl->quantum < 0) { NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); return -EINVAL; } if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; } if (ctl->limit == 1) { NL_SET_ERR_MSG_MOD(extack, "invalid limit"); return -EINVAL; } sch_tree_lock(sch); if (ctl->quantum) q->quantum = ctl->quantum; WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ); if (ctl->flows) q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { q->divisor = ctl->divisor; q->maxflows = min_t(u32, q->maxflows, q->divisor); } if (ctl_v1) { if (ctl_v1->depth) q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); if (p) { swap(q->red_parms, p); red_set_parms(q->red_parms, ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Plog, ctl_v1->Scell_log, NULL, ctl_v1->max_P); } q->flags = ctl_v1->flags; q->headdrop = ctl_v1->headdrop; } if (ctl->limit) { q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); q->maxflows = min_t(u32, q->maxflows, q->limit); } qlen = sch->q.qlen; while (sch->q.qlen > q->limit) { dropped += sfq_drop(sch, &to_free); if (!tail) tail = to_free; } rtnl_kfree_skbs(to_free, tail); qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); del_timer(&q->perturb_timer); if (q->perturb_period) { mod_timer(&q->perturb_timer, jiffies + q->perturb_period); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); } sch_tree_unlock(sch); kfree(p); return 0; } static void *sfq_alloc(size_t sz) { return kvmalloc(sz, GFP_KERNEL); } static void sfq_free(void *addr) { kvfree(addr); } static void sfq_destroy(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); WRITE_ONCE(q->perturb_period, 0); del_timer_sync(&q->perturb_timer); sfq_free(q->ht); sfq_free(q->slots); kfree(q->red_parms); } static int sfq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); int i; int err; q->sch = sch; timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE); err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { q->dep[i].next = i + SFQ_MAX_FLOWS; q->dep[i].prev = i + SFQ_MAX_FLOWS; } q->limit = SFQ_MAX_DEPTH; q->maxdepth = SFQ_MAX_DEPTH; q->cur_depth = 0; q->tail = NULL; q->divisor = SFQ_DEFAULT_HASH_DIVISOR; q->maxflows = SFQ_DEFAULT_FLOWS; q->quantum = psched_mtu(qdisc_dev(sch)); q->perturb_period = 0; get_random_bytes(&q->perturbation, sizeof(q->perturbation)); if (opt) { int err = sfq_change(sch, opt, extack); if (err) return err; } q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); if (!q->ht || !q->slots) { /* Note: sfq_destroy() will be called by our caller */ return -ENOMEM; } for (i = 0; i < q->divisor; i++) q->ht[i] = SFQ_EMPTY_SLOT; for (i = 0; i < q->maxflows; i++) { slot_queue_init(&q->slots[i]); sfq_link(q, i); } if (q->limit >= 1) sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_sfq_qopt_v1 opt; struct red_parms *p = q->red_parms; memset(&opt, 0, sizeof(opt)); opt.v0.quantum = q->quantum; opt.v0.perturb_period = q->perturb_period / HZ; opt.v0.limit = q->limit; opt.v0.divisor = q->divisor; opt.v0.flows = q->maxflows; opt.depth = q->maxdepth; opt.headdrop = q->headdrop; if (p) { opt.qth_min = p->qth_min >> p->Wlog; opt.qth_max = p->qth_max >> p->Wlog; opt.Wlog = p->Wlog; opt.Plog = p->Plog; opt.Scell_log = p->Scell_log; opt.max_P = p->max_P; } memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); opt.flags = q->flags; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } static unsigned long sfq_find(struct Qdisc *sch, u32 classid) { return 0; } static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return 0; } static void sfq_unbind(struct Qdisc *q, unsigned long cl) { } static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static int sfq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct sfq_sched_data *q = qdisc_priv(sch); sfq_index idx = q->ht[cl - 1]; struct gnet_stats_queue qs = { 0 }; struct tc_sfq_xstats xstats = { 0 }; if (idx != SFQ_EMPTY_SLOT) { const struct sfq_slot *slot = &q->slots[idx]; xstats.allot = slot->allot; qs.qlen = slot->qlen; qs.backlog = slot->backlog; } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned int i; if (arg->stop) return; for (i = 0; i < q->divisor; i++) { if (q->ht[i] == SFQ_EMPTY_SLOT) { arg->count++; continue; } if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; } } static const struct Qdisc_class_ops sfq_class_ops = { .leaf = sfq_leaf, .find = sfq_find, .tcf_block = sfq_tcf_block, .bind_tcf = sfq_bind, .unbind_tcf = sfq_unbind, .dump = sfq_dump_class, .dump_stats = sfq_dump_class_stats, .walk = sfq_walk, }; static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { .cl_ops = &sfq_class_ops, .id = "sfq", .priv_size = sizeof(struct sfq_sched_data), .enqueue = sfq_enqueue, .dequeue = sfq_dequeue, .peek = qdisc_peek_dequeued, .init = sfq_init, .reset = sfq_reset, .destroy = sfq_destroy, .change = NULL, .dump = sfq_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("sfq"); static int __init sfq_module_init(void) { return register_qdisc(&sfq_qdisc_ops); } static void __exit sfq_module_exit(void) { unregister_qdisc(&sfq_qdisc_ops); } module_init(sfq_module_init) module_exit(sfq_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Stochastic Fairness qdisc");
7 8 8 8 7 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 // SPDX-License-Identifier: GPL-2.0 /* * Neil Brown <neilb@cse.unsw.edu.au> * J. Bruce Fields <bfields@umich.edu> * Andy Adamson <andros@umich.edu> * Dug Song <dugsong@monkey.org> * * RPCSEC_GSS server authentication. * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 * (gssapi) * * The RPCSEC_GSS involves three stages: * 1/ context creation * 2/ data exchange * 3/ context destruction * * Context creation is handled largely by upcalls to user-space. * In particular, GSS_Accept_sec_context is handled by an upcall * Data exchange is handled entirely within the kernel * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. * Context destruction is handled in-kernel * GSS_Delete_sec_context is in-kernel * * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. * The context handle and gss_token are used as a key into the rpcsec_init cache. * The content of this cache includes some of the outputs of GSS_Accept_sec_context, * being major_status, minor_status, context_handle, reply_token. * These are sent back to the client. * Sequence window management is handled by the kernel. The window size if currently * a compile time constant. * * When user-space is happy that a context is established, it places an entry * in the rpcsec_context cache. The key for this cache is the context_handle. * The content includes: * uid/gidlist - for determining access rights * mechanism type * mechanism specific information, such as a key * */ #include <linux/slab.h> #include <linux/types.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/user_namespace.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/gss_krb5.h> #include <trace/events/rpcgss.h> #include "gss_rpc_upcall.h" /* * Unfortunately there isn't a maximum checksum size exported via the * GSS API. Manufacture one based on GSS mechanisms supported by this * implementation. */ #define GSS_MAX_CKSUMSIZE (GSS_KRB5_TOK_HDR_LEN + GSS_KRB5_MAX_CKSUM_LEN) /* * This value may be increased in the future to accommodate other * usage of the scratch buffer. */ #define GSS_SCRATCH_SIZE GSS_MAX_CKSUMSIZE struct gss_svc_data { /* decoded gss client cred: */ struct rpc_gss_wire_cred clcred; u32 gsd_databody_offset; struct rsc *rsci; /* for temporary results */ __be32 gsd_seq_num; u8 gsd_scratch[GSS_SCRATCH_SIZE]; }; /* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests * into replies. * * Key is context handle (\x if empty) and gss_token. * Content is major_status minor_status (integers) context_handle, reply_token. * */ static int netobj_equal(struct xdr_netobj *a, struct xdr_netobj *b) { return a->len == b->len && 0 == memcmp(a->data, b->data, a->len); } #define RSI_HASHBITS 6 #define RSI_HASHMAX (1<<RSI_HASHBITS) struct rsi { struct cache_head h; struct xdr_netobj in_handle, in_token; struct xdr_netobj out_handle, out_token; int major_status, minor_status; struct rcu_head rcu_head; }; static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old); static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item); static void rsi_free(struct rsi *rsii) { kfree(rsii->in_handle.data); kfree(rsii->in_token.data); kfree(rsii->out_handle.data); kfree(rsii->out_token.data); } static void rsi_free_rcu(struct rcu_head *head) { struct rsi *rsii = container_of(head, struct rsi, rcu_head); rsi_free(rsii); kfree(rsii); } static void rsi_put(struct kref *ref) { struct rsi *rsii = container_of(ref, struct rsi, h.ref); call_rcu(&rsii->rcu_head, rsi_free_rcu); } static inline int rsi_hash(struct rsi *item) { return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS) ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS); } static int rsi_match(struct cache_head *a, struct cache_head *b) { struct rsi *item = container_of(a, struct rsi, h); struct rsi *tmp = container_of(b, struct rsi, h); return netobj_equal(&item->in_handle, &tmp->in_handle) && netobj_equal(&item->in_token, &tmp->in_token); } static int dup_to_netobj(struct xdr_netobj *dst, char *src, int len) { dst->len = len; dst->data = (len ? kmemdup(src, len, GFP_KERNEL) : NULL); if (len && !dst->data) return -ENOMEM; return 0; } static inline int dup_netobj(struct xdr_netobj *dst, struct xdr_netobj *src) { return dup_to_netobj(dst, src->data, src->len); } static void rsi_init(struct cache_head *cnew, struct cache_head *citem) { struct rsi *new = container_of(cnew, struct rsi, h); struct rsi *item = container_of(citem, struct rsi, h); new->out_handle.data = NULL; new->out_handle.len = 0; new->out_token.data = NULL; new->out_token.len = 0; new->in_handle.len = item->in_handle.len; item->in_handle.len = 0; new->in_token.len = item->in_token.len; item->in_token.len = 0; new->in_handle.data = item->in_handle.data; item->in_handle.data = NULL; new->in_token.data = item->in_token.data; item->in_token.data = NULL; } static void update_rsi(struct cache_head *cnew, struct cache_head *citem) { struct rsi *new = container_of(cnew, struct rsi, h); struct rsi *item = container_of(citem, struct rsi, h); BUG_ON(new->out_handle.data || new->out_token.data); new->out_handle.len = item->out_handle.len; item->out_handle.len = 0; new->out_token.len = item->out_token.len; item->out_token.len = 0; new->out_handle.data = item->out_handle.data; item->out_handle.data = NULL; new->out_token.data = item->out_token.data; item->out_token.data = NULL; new->major_status = item->major_status; new->minor_status = item->minor_status; } static struct cache_head *rsi_alloc(void) { struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); if (rsii) return &rsii->h; else return NULL; } static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall_timeout(cd, h); } static void rsi_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { struct rsi *rsii = container_of(h, struct rsi, h); qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); (*bpp)[-1] = '\n'; WARN_ONCE(*blen < 0, "RPCSEC/GSS credential too large - please use gssproxy\n"); } static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) { /* context token expiry major minor context token */ char *buf = mesg; char *ep; int len; struct rsi rsii, *rsip = NULL; time64_t expiry; int status = -EINVAL; memset(&rsii, 0, sizeof(rsii)); /* handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.in_handle, buf, len)) goto out; /* token */ len = qword_get(&mesg, buf, mlen); status = -EINVAL; if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.in_token, buf, len)) goto out; rsip = rsi_lookup(cd, &rsii); if (!rsip) goto out; rsii.h.flags = 0; /* expiry */ status = get_expiry(&mesg, &expiry); if (status) goto out; status = -EINVAL; /* major/minor */ len = qword_get(&mesg, buf, mlen); if (len <= 0) goto out; rsii.major_status = simple_strtoul(buf, &ep, 10); if (*ep) goto out; len = qword_get(&mesg, buf, mlen); if (len <= 0) goto out; rsii.minor_status = simple_strtoul(buf, &ep, 10); if (*ep) goto out; /* out_handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.out_handle, buf, len)) goto out; /* out_token */ len = qword_get(&mesg, buf, mlen); status = -EINVAL; if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.out_token, buf, len)) goto out; rsii.h.expiry_time = expiry; rsip = rsi_update(cd, &rsii, rsip); status = 0; out: rsi_free(&rsii); if (rsip) cache_put(&rsip->h, cd); else status = -ENOMEM; return status; } static const struct cache_detail rsi_cache_template = { .owner = THIS_MODULE, .hash_size = RSI_HASHMAX, .name = "auth.rpcsec.init", .cache_put = rsi_put, .cache_upcall = rsi_upcall, .cache_request = rsi_request, .cache_parse = rsi_parse, .match = rsi_match, .init = rsi_init, .update = update_rsi, .alloc = rsi_alloc, }; static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item) { struct cache_head *ch; int hash = rsi_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsi, h); else return NULL; } static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old) { struct cache_head *ch; int hash = rsi_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct rsi, h); else return NULL; } /* * The rpcsec_context cache is used to store a context that is * used in data exchange. * The key is a context handle. The content is: * uid, gidlist, mechanism, service-set, mech-specific-data */ #define RSC_HASHBITS 10 #define RSC_HASHMAX (1<<RSC_HASHBITS) #define GSS_SEQ_WIN 128 struct gss_svc_seq_data { /* highest seq number seen so far: */ u32 sd_max; /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of * sd_win is nonzero iff sequence number i has been seen already: */ unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG]; spinlock_t sd_lock; }; struct rsc { struct cache_head h; struct xdr_netobj handle; struct svc_cred cred; struct gss_svc_seq_data seqdata; struct gss_ctx *mechctx; struct rcu_head rcu_head; }; static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old); static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item); static void rsc_free(struct rsc *rsci) { kfree(rsci->handle.data); if (rsci->mechctx) gss_delete_sec_context(&rsci->mechctx); free_svc_cred(&rsci->cred); } static void rsc_free_rcu(struct rcu_head *head) { struct rsc *rsci = container_of(head, struct rsc, rcu_head); kfree(rsci->handle.data); kfree(rsci); } static void rsc_put(struct kref *ref) { struct rsc *rsci = container_of(ref, struct rsc, h.ref); if (rsci->mechctx) gss_delete_sec_context(&rsci->mechctx); free_svc_cred(&rsci->cred); call_rcu(&rsci->rcu_head, rsc_free_rcu); } static inline int rsc_hash(struct rsc *rsci) { return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS); } static int rsc_match(struct cache_head *a, struct cache_head *b) { struct rsc *new = container_of(a, struct rsc, h); struct rsc *tmp = container_of(b, struct rsc, h); return netobj_equal(&new->handle, &tmp->handle); } static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp) { struct rsc *new = container_of(cnew, struct rsc, h); struct rsc *tmp = container_of(ctmp, struct rsc, h); new->handle.len = tmp->handle.len; tmp->handle.len = 0; new->handle.data = tmp->handle.data; tmp->handle.data = NULL; new->mechctx = NULL; init_svc_cred(&new->cred); } static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp) { struct rsc *new = container_of(cnew, struct rsc, h); struct rsc *tmp = container_of(ctmp, struct rsc, h); new->mechctx = tmp->mechctx; tmp->mechctx = NULL; memset(&new->seqdata, 0, sizeof(new->seqdata)); spin_lock_init(&new->seqdata.sd_lock); new->cred = tmp->cred; init_svc_cred(&tmp->cred); } static struct cache_head * rsc_alloc(void) { struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); if (rsci) return &rsci->h; else return NULL; } static int rsc_upcall(struct cache_detail *cd, struct cache_head *h) { return -EINVAL; } static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) { /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */ char *buf = mesg; int id; int len, rv; struct rsc rsci, *rscp = NULL; time64_t expiry; int status = -EINVAL; struct gss_api_mech *gm = NULL; memset(&rsci, 0, sizeof(rsci)); /* context handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsci.handle, buf, len)) goto out; rsci.h.flags = 0; /* expiry */ status = get_expiry(&mesg, &expiry); if (status) goto out; status = -EINVAL; rscp = rsc_lookup(cd, &rsci); if (!rscp) goto out; /* uid, or NEGATIVE */ rv = get_int(&mesg, &id); if (rv == -EINVAL) goto out; if (rv == -ENOENT) set_bit(CACHE_NEGATIVE, &rsci.h.flags); else { int N, i; /* * NOTE: we skip uid_valid()/gid_valid() checks here: * instead, * -1 id's are later mapped to the * (export-specific) anonymous id by nfsd_setuser. * * (But supplementary gid's get no such special * treatment so are checked for validity here.) */ /* uid */ rsci.cred.cr_uid = make_kuid(current_user_ns(), id); /* gid */ if (get_int(&mesg, &id)) goto out; rsci.cred.cr_gid = make_kgid(current_user_ns(), id); /* number of additional gid's */ if (get_int(&mesg, &N)) goto out; if (N < 0 || N > NGROUPS_MAX) goto out; status = -ENOMEM; rsci.cred.cr_group_info = groups_alloc(N); if (rsci.cred.cr_group_info == NULL) goto out; /* gid's */ status = -EINVAL; for (i=0; i<N; i++) { kgid_t kgid; if (get_int(&mesg, &id)) goto out; kgid = make_kgid(current_user_ns(), id); if (!gid_valid(kgid)) goto out; rsci.cred.cr_group_info->gid[i] = kgid; } groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; gm = rsci.cred.cr_gss_mech = gss_mech_get_by_name(buf); status = -EOPNOTSUPP; if (!gm) goto out; status = -EINVAL; /* mech-specific data: */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, NULL, GFP_KERNEL); if (status) goto out; /* get client name */ len = qword_get(&mesg, buf, mlen); if (len > 0) { rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL); if (!rsci.cred.cr_principal) { status = -ENOMEM; goto out; } } } rsci.h.expiry_time = expiry; rscp = rsc_update(cd, &rsci, rscp); status = 0; out: rsc_free(&rsci); if (rscp) cache_put(&rscp->h, cd); else status = -ENOMEM; return status; } static const struct cache_detail rsc_cache_template = { .owner = THIS_MODULE, .hash_size = RSC_HASHMAX, .name = "auth.rpcsec.context", .cache_put = rsc_put, .cache_upcall = rsc_upcall, .cache_parse = rsc_parse, .match = rsc_match, .init = rsc_init, .update = update_rsc, .alloc = rsc_alloc, }; static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item) { struct cache_head *ch; int hash = rsc_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsc, h); else return NULL; } static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old) { struct cache_head *ch; int hash = rsc_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct rsc, h); else return NULL; } static struct rsc * gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) { struct rsc rsci; struct rsc *found; memset(&rsci, 0, sizeof(rsci)); if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) return NULL; found = rsc_lookup(cd, &rsci); rsc_free(&rsci); if (!found) return NULL; if (cache_check(cd, &found->h, NULL)) return NULL; return found; } /** * gss_check_seq_num - GSS sequence number window check * @rqstp: RPC Call to use when reporting errors * @rsci: cached GSS context state (updated on return) * @seq_num: sequence number to check * * Implements sequence number algorithm as specified in * RFC 2203, Section 5.3.3.1. "Context Management". * * Return values: * %true: @rqstp's GSS sequence number is inside the window * %false: @rqstp's GSS sequence number is outside the window */ static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci, u32 seq_num) { struct gss_svc_seq_data *sd = &rsci->seqdata; bool result = false; spin_lock(&sd->sd_lock); if (seq_num > sd->sd_max) { if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { memset(sd->sd_win, 0, sizeof(sd->sd_win)); sd->sd_max = seq_num; } else while (sd->sd_max < seq_num) { sd->sd_max++; __clear_bit(sd->sd_max % GSS_SEQ_WIN, sd->sd_win); } __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); goto ok; } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) { goto toolow; } if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) goto alreadyseen; ok: result = true; out: spin_unlock(&sd->sd_lock); return result; toolow: trace_rpcgss_svc_seqno_low(rqstp, seq_num, sd->sd_max - GSS_SEQ_WIN, sd->sd_max); goto out; alreadyseen: trace_rpcgss_svc_seqno_seen(rqstp, seq_num); goto out; } /* * Decode and verify a Call's verifier field. For RPC_AUTH_GSS Calls, * the body of this field contains a variable length checksum. * * GSS-specific auth_stat values are mandated by RFC 2203 Section * 5.3.3.3. */ static int svcauth_gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, __be32 *rpcstart, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct gss_ctx *ctx_id = rsci->mechctx; u32 flavor, maj_stat; struct xdr_buf rpchdr; struct xdr_netobj checksum; struct kvec iov; /* * Compute the checksum of the incoming Call from the * XID field to credential field: */ iov.iov_base = rpcstart; iov.iov_len = (u8 *)xdr->p - (u8 *)rpcstart; xdr_buf_from_iov(&iov, &rpchdr); /* Call's verf field: */ if (xdr_stream_decode_opaque_auth(xdr, &flavor, (void **)&checksum.data, &checksum.len) < 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (flavor != RPC_AUTH_GSS) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (rqstp->rq_deferred) return SVC_OK; maj_stat = gss_verify_mic(ctx_id, &rpchdr, &checksum); if (maj_stat != GSS_S_COMPLETE) { trace_rpcgss_svc_mic(rqstp, maj_stat); rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; return SVC_DENIED; } if (gc->gc_seq > MAXSEQ) { trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq); rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq)) return SVC_DROP; return SVC_OK; } /* * Construct and encode a Reply's verifier field. The verifier's body * field contains a variable-length checksum of the GSS sequence * number. */ static bool svcauth_gss_encode_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) { struct gss_svc_data *gsd = rqstp->rq_auth_data; u32 maj_stat; struct xdr_buf verf_data; struct xdr_netobj checksum; struct kvec iov; gsd->gsd_seq_num = cpu_to_be32(seq); iov.iov_base = &gsd->gsd_seq_num; iov.iov_len = XDR_UNIT; xdr_buf_from_iov(&iov, &verf_data); checksum.data = gsd->gsd_scratch; maj_stat = gss_get_mic(ctx_id, &verf_data, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; return xdr_stream_encode_opaque_auth(&rqstp->rq_res_stream, RPC_AUTH_GSS, checksum.data, checksum.len) > 0; bad_mic: trace_rpcgss_svc_get_mic(rqstp, maj_stat); return false; } struct gss_domain { struct auth_domain h; u32 pseudoflavor; }; static struct auth_domain * find_gss_auth_domain(struct gss_ctx *ctx, u32 svc) { char *name; name = gss_service_to_auth_domain_name(ctx->mech_type, svc); if (!name) return NULL; return auth_domain_find(name); } static struct auth_ops svcauthops_gss; u32 svcauth_gss_flavor(struct auth_domain *dom) { struct gss_domain *gd = container_of(dom, struct gss_domain, h); return gd->pseudoflavor; } EXPORT_SYMBOL_GPL(svcauth_gss_flavor); struct auth_domain * svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) { struct gss_domain *new; struct auth_domain *test; int stat = -ENOMEM; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out; kref_init(&new->h.ref); new->h.name = kstrdup(name, GFP_KERNEL); if (!new->h.name) goto out_free_dom; new->h.flavour = &svcauthops_gss; new->pseudoflavor = pseudoflavor; test = auth_domain_lookup(name, &new->h); if (test != &new->h) { pr_warn("svc: duplicate registration of gss pseudo flavour %s.\n", name); stat = -EADDRINUSE; auth_domain_put(test); goto out_free_name; } return test; out_free_name: kfree(new->h.name); out_free_dom: kfree(new); out: return ERR_PTR(stat); } EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int svcauth_gss_unwrap_integ(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 len, offset, seq_num, maj_stat; struct xdr_buf *buf = xdr->buf; struct xdr_buf databody_integ; struct xdr_netobj checksum; /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) return 0; if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; if (len & 3) goto unwrap_failed; offset = xdr_stream_pos(xdr); if (xdr_buf_subsegment(buf, &databody_integ, offset, len)) goto unwrap_failed; /* * The xdr_stream now points to the @seq_num field. The next * XDR data item is the @arg field, which contains the clear * text RPC program payload. The checksum, which follows the * @arg field, is located and decoded without updating the * xdr_stream. */ offset += len; if (xdr_decode_word(buf, offset, &checksum.len)) goto unwrap_failed; if (checksum.len > sizeof(gsd->gsd_scratch)) goto unwrap_failed; checksum.data = gsd->gsd_scratch; if (read_bytes_from_xdr_buf(buf, offset + XDR_UNIT, checksum.data, checksum.len)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx, &databody_integ, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; /* The received seqno is protected by the checksum. */ if (xdr_stream_decode_u32(xdr, &seq_num) < 0) goto unwrap_failed; if (seq_num != seq) goto bad_seqno; xdr_truncate_decode(xdr, XDR_UNIT + checksum.len); return 0; unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); return -EINVAL; bad_seqno: trace_rpcgss_svc_seqno_bad(rqstp, seq, seq_num); return -EINVAL; bad_mic: trace_rpcgss_svc_mic(rqstp, maj_stat); return -EINVAL; } /* * RFC 2203, Section 5.3.2.3 * * struct rpc_gss_priv_data { * opaque databody_priv<> * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int svcauth_gss_unwrap_priv(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 len, maj_stat, seq_num, offset; struct xdr_buf *buf = xdr->buf; unsigned int saved_len; if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; if (rqstp->rq_deferred) { /* Already decrypted last time through! The sequence number * check at out_seq is unnecessary but harmless: */ goto out_seq; } if (len > xdr_stream_remaining(xdr)) goto unwrap_failed; offset = xdr_stream_pos(xdr); saved_len = buf->len; maj_stat = gss_unwrap(ctx, offset, offset + len, buf); if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; xdr->nwords -= XDR_QUADLEN(saved_len - buf->len); out_seq: /* gss_unwrap() decrypted the sequence number. */ if (xdr_stream_decode_u32(xdr, &seq_num) < 0) goto unwrap_failed; if (seq_num != seq) goto bad_seqno; return 0; unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); return -EINVAL; bad_seqno: trace_rpcgss_svc_seqno_bad(rqstp, seq, seq_num); return -EINVAL; bad_unwrap: trace_rpcgss_svc_unwrap(rqstp, maj_stat); return -EINVAL; } static enum svc_auth_status svcauth_gss_set_client(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; struct rsc *rsci = svcdata->rsci; struct rpc_gss_wire_cred *gc = &svcdata->clcred; int stat; rqstp->rq_auth_stat = rpc_autherr_badcred; /* * A gss export can be specified either by: * export *(sec=krb5,rw) * or by * export gss/krb5(rw) * The latter is deprecated; but for backwards compatibility reasons * the nfsd code will still fall back on trying it if the former * doesn't work; so we try to make both available to nfsd, below. */ rqstp->rq_gssclient = find_gss_auth_domain(rsci->mechctx, gc->gc_svc); if (rqstp->rq_gssclient == NULL) return SVC_DENIED; stat = svcauth_unix_set_client(rqstp); if (stat == SVC_DROP || stat == SVC_CLOSE) return stat; rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } static bool svcauth_gss_proc_init_verf(struct cache_detail *cd, struct svc_rqst *rqstp, struct xdr_netobj *out_handle, int *major_status, u32 seq_num) { struct xdr_stream *xdr = &rqstp->rq_res_stream; struct rsc *rsci; bool rc; if (*major_status != GSS_S_COMPLETE) goto null_verifier; rsci = gss_svc_searchbyctx(cd, out_handle); if (rsci == NULL) { *major_status = GSS_S_NO_CONTEXT; goto null_verifier; } rc = svcauth_gss_encode_verf(rqstp, rsci->mechctx, seq_num); cache_put(&rsci->h, cd); return rc; null_verifier: return xdr_stream_encode_opaque_auth(xdr, RPC_AUTH_NULL, NULL, 0) > 0; } static void gss_free_in_token_pages(struct gssp_in_token *in_token) { int i; i = 0; while (in_token->pages[i]) put_page(in_token->pages[i++]); kfree(in_token->pages); in_token->pages = NULL; } static int gss_read_proxy_verf(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc, struct xdr_netobj *in_handle, struct gssp_in_token *in_token) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; unsigned int length, pgto_offs, pgfrom_offs; int pages, i, pgto, pgfrom; size_t to_offs, from_offs; u32 inlen; if (dup_netobj(in_handle, &gc->gc_ctx)) return SVC_CLOSE; /* * RFC 2203 Section 5.2.2 * * struct rpc_gss_init_arg { * opaque gss_token<>; * }; */ if (xdr_stream_decode_u32(xdr, &inlen) < 0) goto out_denied_free; if (inlen > xdr_stream_remaining(xdr)) goto out_denied_free; pages = DIV_ROUND_UP(inlen, PAGE_SIZE); in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); if (!in_token->pages) goto out_denied_free; in_token->page_base = 0; in_token->page_len = inlen; for (i = 0; i < pages; i++) { in_token->pages[i] = alloc_page(GFP_KERNEL); if (!in_token->pages[i]) { gss_free_in_token_pages(in_token); goto out_denied_free; } } length = min_t(unsigned int, inlen, (char *)xdr->end - (char *)xdr->p); memcpy(page_address(in_token->pages[0]), xdr->p, length); inlen -= length; to_offs = length; from_offs = rqstp->rq_arg.page_base; while (inlen) { pgto = to_offs >> PAGE_SHIFT; pgfrom = from_offs >> PAGE_SHIFT; pgto_offs = to_offs & ~PAGE_MASK; pgfrom_offs = from_offs & ~PAGE_MASK; length = min_t(unsigned int, inlen, min_t(unsigned int, PAGE_SIZE - pgto_offs, PAGE_SIZE - pgfrom_offs)); memcpy(page_address(in_token->pages[pgto]) + pgto_offs, page_address(rqstp->rq_arg.pages[pgfrom]) + pgfrom_offs, length); to_offs += length; from_offs += length; inlen -= length; } return 0; out_denied_free: kfree(in_handle->data); return SVC_DENIED; } /* * RFC 2203, Section 5.2.3.1. * * struct rpc_gss_init_res { * opaque handle<>; * unsigned int gss_major; * unsigned int gss_minor; * unsigned int seq_window; * opaque gss_token<>; * }; */ static bool svcxdr_encode_gss_init_res(struct xdr_stream *xdr, struct xdr_netobj *handle, struct xdr_netobj *gss_token, unsigned int major_status, unsigned int minor_status, u32 seq_num) { if (xdr_stream_encode_opaque(xdr, handle->data, handle->len) < 0) return false; if (xdr_stream_encode_u32(xdr, major_status) < 0) return false; if (xdr_stream_encode_u32(xdr, minor_status) < 0) return false; if (xdr_stream_encode_u32(xdr, seq_num) < 0) return false; if (xdr_stream_encode_opaque(xdr, gss_token->data, gss_token->len) < 0) return false; return true; } /* * Having read the cred already and found we're in the context * initiation case, read the verifier and initiate (or check the results * of) upcalls to userspace for help with context initiation. If * the upcall results are available, write the verifier and result. * Otherwise, drop the request pending an answer to the upcall. */ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct rsi *rsip, rsikey; __be32 *p; u32 len; int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) return SVC_CLOSE; /* * RFC 2203 Section 5.2.2 * * struct rpc_gss_init_arg { * opaque gss_token<>; * }; */ if (xdr_stream_decode_u32(xdr, &len) < 0) { kfree(rsikey.in_handle.data); return SVC_DENIED; } p = xdr_inline_decode(xdr, len); if (!p) { kfree(rsikey.in_handle.data); return SVC_DENIED; } rsikey.in_token.data = kmalloc(len, GFP_KERNEL); if (ZERO_OR_NULL_PTR(rsikey.in_token.data)) { kfree(rsikey.in_handle.data); return SVC_CLOSE; } memcpy(rsikey.in_token.data, p, len); rsikey.in_token.len = len; /* Perform upcall, or find upcall result: */ rsip = rsi_lookup(sn->rsi_cache, &rsikey); rsi_free(&rsikey); if (!rsip) return SVC_CLOSE; if (cache_check(sn->rsi_cache, &rsip->h, &rqstp->rq_chandle) < 0) /* No upcall result: */ return SVC_CLOSE; ret = SVC_CLOSE; if (!svcauth_gss_proc_init_verf(sn->rsc_cache, rqstp, &rsip->out_handle, &rsip->major_status, GSS_SEQ_WIN)) goto out; if (!svcxdr_set_accept_stat(rqstp)) goto out; if (!svcxdr_encode_gss_init_res(&rqstp->rq_res_stream, &rsip->out_handle, &rsip->out_token, rsip->major_status, rsip->minor_status, GSS_SEQ_WIN)) goto out; ret = SVC_COMPLETE; out: cache_put(&rsip->h, sn->rsi_cache); return ret; } static int gss_proxy_save_rsc(struct cache_detail *cd, struct gssp_upcall_data *ud, uint64_t *handle) { struct rsc rsci, *rscp = NULL; static atomic64_t ctxhctr; long long ctxh; struct gss_api_mech *gm = NULL; time64_t expiry; int status; memset(&rsci, 0, sizeof(rsci)); /* context handle */ status = -ENOMEM; /* the handle needs to be just a unique id, * use a static counter */ ctxh = atomic64_inc_return(&ctxhctr); /* make a copy for the caller */ *handle = ctxh; /* make a copy for the rsc cache */ if (dup_to_netobj(&rsci.handle, (char *)handle, sizeof(uint64_t))) goto out; rscp = rsc_lookup(cd, &rsci); if (!rscp) goto out; /* creds */ if (!ud->found_creds) { /* userspace seem buggy, we should always get at least a * mapping to nobody */ goto out; } else { struct timespec64 boot; /* steal creds */ rsci.cred = ud->creds; memset(&ud->creds, 0, sizeof(struct svc_cred)); status = -EOPNOTSUPP; /* get mech handle from OID */ gm = gss_mech_get_by_OID(&ud->mech_oid); if (!gm) goto out; rsci.cred.cr_gss_mech = gm; status = -EINVAL; /* mech-specific data: */ status = gss_import_sec_context(ud->out_handle.data, ud->out_handle.len, gm, &rsci.mechctx, &expiry, GFP_KERNEL); if (status) goto out; getboottime64(&boot); expiry -= boot.tv_sec; } rsci.h.expiry_time = expiry; rscp = rsc_update(cd, &rsci, rscp); status = 0; out: rsc_free(&rsci); if (rscp) cache_put(&rscp->h, cd); else status = -ENOMEM; return status; } static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_netobj cli_handle; struct gssp_upcall_data ud; uint64_t handle; int status; int ret; struct net *net = SVC_NET(rqstp); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); memset(&ud, 0, sizeof(ud)); ret = gss_read_proxy_verf(rqstp, gc, &ud.in_handle, &ud.in_token); if (ret) return ret; ret = SVC_CLOSE; /* Perform synchronous upcall to gss-proxy */ status = gssp_accept_sec_context_upcall(net, &ud); if (status) goto out; trace_rpcgss_svc_accept_upcall(rqstp, ud.major_status, ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: cli_handle = ud.out_handle; break; case GSS_S_COMPLETE: status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle); if (status) goto out; cli_handle.data = (u8 *)&handle; cli_handle.len = sizeof(handle); break; default: goto out; } if (!svcauth_gss_proc_init_verf(sn->rsc_cache, rqstp, &cli_handle, &ud.major_status, GSS_SEQ_WIN)) goto out; if (!svcxdr_set_accept_stat(rqstp)) goto out; if (!svcxdr_encode_gss_init_res(&rqstp->rq_res_stream, &cli_handle, &ud.out_token, ud.major_status, ud.minor_status, GSS_SEQ_WIN)) goto out; ret = SVC_COMPLETE; out: gss_free_in_token_pages(&ud.in_token); gssp_free_upcall_data(&ud); return ret; } /* * Try to set the sn->use_gss_proxy variable to a new value. We only allow * it to be changed if it's currently undefined (-1). If it's any other value * then return -EBUSY unless the type wouldn't have changed anyway. */ static int set_gss_proxy(struct net *net, int type) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret; WARN_ON_ONCE(type != 0 && type != 1); ret = cmpxchg(&sn->use_gss_proxy, -1, type); if (ret != -1 && ret != type) return -EBUSY; return 0; } static bool use_gss_proxy(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); /* If use_gss_proxy is still undefined, then try to disable it */ if (sn->use_gss_proxy == -1) set_gss_proxy(net, 0); return sn->use_gss_proxy; } static noinline_for_stack int svcauth_gss_proc_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 flavor, len; void *body; /* Call's verf field: */ if (xdr_stream_decode_opaque_auth(xdr, &flavor, &body, &len) < 0) return SVC_GARBAGE; if (flavor != RPC_AUTH_NULL || len != 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) { rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } if (!use_gss_proxy(SVC_NET(rqstp))) return svcauth_gss_legacy_init(rqstp, gc); return svcauth_gss_proxy_init(rqstp, gc); } #ifdef CONFIG_PROC_FS static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct net *net = pde_data(file_inode(file)); char tbuf[20]; unsigned long i; int res; if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; if (copy_from_user(tbuf, buf, count)) return -EFAULT; tbuf[count] = 0; res = kstrtoul(tbuf, 0, &i); if (res) return res; if (i != 1) return -EINVAL; res = set_gssp_clnt(net); if (res) return res; res = set_gss_proxy(net, 1); if (res) return res; return count; } static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct net *net = pde_data(file_inode(file)); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; size_t len; snprintf(tbuf, sizeof(tbuf), "%d\n", sn->use_gss_proxy); len = strlen(tbuf); if (p >= len) return 0; len -= p; if (len > count) len = count; if (copy_to_user(buf, (void *)(tbuf+p), len)) return -EFAULT; *ppos += len; return len; } static const struct proc_ops use_gss_proxy_proc_ops = { .proc_open = nonseekable_open, .proc_write = write_gssp, .proc_read = read_gssp, }; static int create_use_gss_proxy_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct proc_dir_entry **p = &sn->use_gssp_proc; sn->use_gss_proxy = -1; *p = proc_create_data("use-gss-proxy", S_IFREG | 0600, sn->proc_net_rpc, &use_gss_proxy_proc_ops, net); if (!*p) return -ENOMEM; init_gssp_clnt(sn); return 0; } static void destroy_use_gss_proxy_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (sn->use_gssp_proc) { remove_proc_entry("use-gss-proxy", sn->proc_net_rpc); clear_gssp_clnt(sn); } } static ssize_t read_gss_krb5_enctypes(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct rpcsec_gss_oid oid = { .len = 9, .data = "\x2a\x86\x48\x86\xf7\x12\x01\x02\x02", }; struct gss_api_mech *mech; ssize_t ret; mech = gss_mech_get_by_OID(&oid); if (!mech) return 0; if (!mech->gm_upcall_enctypes) { gss_mech_put(mech); return 0; } ret = simple_read_from_buffer(buf, count, ppos, mech->gm_upcall_enctypes, strlen(mech->gm_upcall_enctypes)); gss_mech_put(mech); return ret; } static const struct proc_ops gss_krb5_enctypes_proc_ops = { .proc_open = nonseekable_open, .proc_read = read_gss_krb5_enctypes, }; static int create_krb5_enctypes_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); sn->gss_krb5_enctypes = proc_create_data("gss_krb5_enctypes", S_IFREG | 0444, sn->proc_net_rpc, &gss_krb5_enctypes_proc_ops, net); return sn->gss_krb5_enctypes ? 0 : -ENOMEM; } static void destroy_krb5_enctypes_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (sn->gss_krb5_enctypes) remove_proc_entry("gss_krb5_enctypes", sn->proc_net_rpc); } #else /* CONFIG_PROC_FS */ static int create_use_gss_proxy_proc_entry(struct net *net) { return 0; } static void destroy_use_gss_proxy_proc_entry(struct net *net) {} static int create_krb5_enctypes_proc_entry(struct net *net) { return 0; } static void destroy_krb5_enctypes_proc_entry(struct net *net) {} #endif /* CONFIG_PROC_FS */ /* * The Call's credential body should contain a struct rpc_gss_cred_t. * * RFC 2203 Section 5 * * struct rpc_gss_cred_t { * union switch (unsigned int version) { * case RPCSEC_GSS_VERS_1: * struct { * rpc_gss_proc_t gss_proc; * unsigned int seq_num; * rpc_gss_service_t service; * opaque handle<>; * } rpc_gss_cred_vers_1_t; * } * }; */ static bool svcauth_gss_decode_credbody(struct xdr_stream *xdr, struct rpc_gss_wire_cred *gc, __be32 **rpcstart) { ssize_t handle_len; u32 body_len; __be32 *p; p = xdr_inline_decode(xdr, XDR_UNIT); if (!p) return false; /* * start of rpc packet is 7 u32's back from here: * xid direction rpcversion prog vers proc flavour */ *rpcstart = p - 7; body_len = be32_to_cpup(p); if (body_len > RPC_MAX_AUTH_SIZE) return false; /* struct rpc_gss_cred_t */ if (xdr_stream_decode_u32(xdr, &gc->gc_v) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_proc) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_seq) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_svc) < 0) return false; handle_len = xdr_stream_decode_opaque_inline(xdr, (void **)&gc->gc_ctx.data, body_len); if (handle_len < 0) return false; if (body_len != XDR_UNIT * 5 + xdr_align_size(handle_len)) return false; gc->gc_ctx.len = handle_len; return true; } /** * svcauth_gss_accept - Decode and validate incoming RPC_AUTH_GSS credential * @rqstp: RPC transaction * * Return values: * %SVC_OK: Success * %SVC_COMPLETE: GSS context lifetime event * %SVC_DENIED: Credential or verifier is not valid * %SVC_GARBAGE: Failed to decode credential or verifier * %SVC_CLOSE: Temporary failure * * The rqstp->rq_auth_stat field is also set (see RFCs 2203 and 5531). */ static enum svc_auth_status svcauth_gss_accept(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; __be32 *rpcstart; struct rpc_gss_wire_cred *gc; struct rsc *rsci = NULL; int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); rqstp->rq_auth_stat = rpc_autherr_badcred; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; svcdata->gsd_databody_offset = 0; svcdata->rsci = NULL; gc = &svcdata->clcred; if (!svcauth_gss_decode_credbody(&rqstp->rq_arg_stream, gc, &rpcstart)) goto auth_err; if (gc->gc_v != RPC_GSS_VERSION) goto auth_err; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: if (rqstp->rq_proc != 0) goto auth_err; return svcauth_gss_proc_init(rqstp, gc); case RPC_GSS_PROC_DESTROY: if (rqstp->rq_proc != 0) goto auth_err; fallthrough; case RPC_GSS_PROC_DATA: rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx); if (!rsci) goto auth_err; switch (svcauth_gss_verify_header(rqstp, rsci, rpcstart, gc)) { case SVC_OK: break; case SVC_DENIED: goto auth_err; case SVC_DROP: goto drop; } break; default: if (rqstp->rq_proc != 0) goto auth_err; rqstp->rq_auth_stat = rpc_autherr_rejectedcred; goto auth_err; } /* now act upon the command: */ switch (gc->gc_proc) { case RPC_GSS_PROC_DESTROY: if (!svcauth_gss_encode_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; if (!svcxdr_set_accept_stat(rqstp)) goto auth_err; /* Delete the entry from the cache_list and call cache_put */ sunrpc_cache_unhash(sn->rsc_cache, &rsci->h); goto complete; case RPC_GSS_PROC_DATA: rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; if (!svcauth_gss_encode_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; if (!svcxdr_set_accept_stat(rqstp)) goto auth_err; svcdata->gsd_databody_offset = xdr_stream_pos(&rqstp->rq_res_stream); rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); rqstp->rq_auth_stat = rpc_autherr_badcred; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; case RPC_GSS_SVC_INTEGRITY: /* placeholders for body length and seq. number: */ xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2); if (svcauth_gss_unwrap_integ(rqstp, gc->gc_seq, rsci->mechctx)) goto garbage_args; svcxdr_set_auth_slack(rqstp, RPC_MAX_AUTH_SIZE); break; case RPC_GSS_SVC_PRIVACY: /* placeholders for body length and seq. number: */ xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2); if (svcauth_gss_unwrap_priv(rqstp, gc->gc_seq, rsci->mechctx)) goto garbage_args; svcxdr_set_auth_slack(rqstp, RPC_MAX_AUTH_SIZE * 2); break; default: goto auth_err; } svcdata->rsci = rsci; cache_get(&rsci->h); rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor( rsci->mechctx->mech_type, GSS_C_QOP_DEFAULT, gc->gc_svc); ret = SVC_OK; trace_rpcgss_svc_authenticate(rqstp, gc); goto out; } garbage_args: ret = SVC_GARBAGE; goto out; auth_err: xdr_truncate_encode(&rqstp->rq_res_stream, XDR_UNIT * 2); ret = SVC_DENIED; goto out; complete: ret = SVC_COMPLETE; goto out; drop: ret = SVC_CLOSE; out: if (rsci) cache_put(&rsci->h, sn->rsc_cache); return ret; } static u32 svcauth_gss_prepare_to_wrap(struct svc_rqst *rqstp, struct gss_svc_data *gsd) { u32 offset; /* Release can be called twice, but we only wrap once. */ offset = gsd->gsd_databody_offset; gsd->gsd_databody_offset = 0; /* AUTH_ERROR replies are not wrapped. */ if (rqstp->rq_auth_stat != rpc_auth_ok) return 0; /* Also don't wrap if the accept_stat is nonzero: */ if (*rqstp->rq_accept_statp != rpc_success) return 0; return offset; } /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; * * The RPC Reply message has already been XDR-encoded. rq_res_stream * is now positioned so that the checksum can be written just past * the RPC Reply message. */ static int svcauth_gss_wrap_integ(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct xdr_stream *xdr = &rqstp->rq_res_stream; struct rpc_gss_wire_cred *gc = &gsd->clcred; struct xdr_buf *buf = xdr->buf; struct xdr_buf databody_integ; struct xdr_netobj checksum; u32 offset, maj_stat; offset = svcauth_gss_prepare_to_wrap(rqstp, gsd); if (!offset) goto out; if (xdr_buf_subsegment(buf, &databody_integ, offset + XDR_UNIT, buf->len - offset - XDR_UNIT)) goto wrap_failed; /* Buffer space for these has already been reserved in * svcauth_gss_accept(). */ if (xdr_encode_word(buf, offset, databody_integ.len)) goto wrap_failed; if (xdr_encode_word(buf, offset + XDR_UNIT, gc->gc_seq)) goto wrap_failed; checksum.data = gsd->gsd_scratch; maj_stat = gss_get_mic(gsd->rsci->mechctx, &databody_integ, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; if (xdr_stream_encode_opaque(xdr, checksum.data, checksum.len) < 0) goto wrap_failed; xdr_commit_encode(xdr); out: return 0; bad_mic: trace_rpcgss_svc_get_mic(rqstp, maj_stat); return -EINVAL; wrap_failed: trace_rpcgss_svc_wrap_failed(rqstp); return -EINVAL; } /* * RFC 2203, Section 5.3.2.3 * * struct rpc_gss_priv_data { * opaque databody_priv<> * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; * * gss_wrap() expands the size of the RPC message payload in the * response buffer. The main purpose of svcauth_gss_wrap_priv() * is to ensure there is adequate space in the response buffer to * avoid overflow during the wrap. */ static int svcauth_gss_wrap_priv(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc = &gsd->clcred; struct xdr_buf *buf = &rqstp->rq_res; struct kvec *head = buf->head; struct kvec *tail = buf->tail; u32 offset, pad, maj_stat; __be32 *p; offset = svcauth_gss_prepare_to_wrap(rqstp, gsd); if (!offset) return 0; /* * Buffer space for this field has already been reserved * in svcauth_gss_accept(). Note that the GSS sequence * number is encrypted along with the RPC reply payload. */ if (xdr_encode_word(buf, offset + XDR_UNIT, gc->gc_seq)) goto wrap_failed; /* * If there is currently tail data, make sure there is * room for the head, tail, and 2 * RPC_MAX_AUTH_SIZE in * the page, and move the current tail data such that * there is RPC_MAX_AUTH_SIZE slack space available in * both the head and tail. */ if (tail->iov_base) { if (tail->iov_base >= head->iov_base + PAGE_SIZE) goto wrap_failed; if (tail->iov_base < head->iov_base) goto wrap_failed; if (tail->iov_len + head->iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) goto wrap_failed; memmove(tail->iov_base + RPC_MAX_AUTH_SIZE, tail->iov_base, tail->iov_len); tail->iov_base += RPC_MAX_AUTH_SIZE; } /* * If there is no current tail data, make sure there is * room for the head data, and 2 * RPC_MAX_AUTH_SIZE in the * allotted page, and set up tail information such that there * is RPC_MAX_AUTH_SIZE slack space available in both the * head and tail. */ if (!tail->iov_base) { if (head->iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) goto wrap_failed; tail->iov_base = head->iov_base + head->iov_len + RPC_MAX_AUTH_SIZE; tail->iov_len = 0; } maj_stat = gss_wrap(gsd->rsci->mechctx, offset + XDR_UNIT, buf, buf->pages); if (maj_stat != GSS_S_COMPLETE) goto bad_wrap; /* Wrapping can change the size of databody_priv. */ if (xdr_encode_word(buf, offset, buf->len - offset - XDR_UNIT)) goto wrap_failed; pad = xdr_pad_size(buf->len - offset - XDR_UNIT); p = (__be32 *)(tail->iov_base + tail->iov_len); memset(p, 0, pad); tail->iov_len += pad; buf->len += pad; return 0; wrap_failed: trace_rpcgss_svc_wrap_failed(rqstp); return -EINVAL; bad_wrap: trace_rpcgss_svc_wrap(rqstp, maj_stat); return -ENOMEM; } /** * svcauth_gss_release - Wrap payload and release resources * @rqstp: RPC transaction context * * Return values: * %0: the Reply is ready to be sent * %-ENOMEM: failed to allocate memory * %-EINVAL: encoding error */ static int svcauth_gss_release(struct svc_rqst *rqstp) { struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); struct gss_svc_data *gsd = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc; int stat; if (!gsd) goto out; gc = &gsd->clcred; if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; case RPC_GSS_SVC_INTEGRITY: stat = svcauth_gss_wrap_integ(rqstp); if (stat) goto out_err; break; case RPC_GSS_SVC_PRIVACY: stat = svcauth_gss_wrap_priv(rqstp); if (stat) goto out_err; break; /* * For any other gc_svc value, svcauth_gss_accept() already set * the auth_error appropriately; just fall through: */ } out: stat = 0; out_err: if (rqstp->rq_client) auth_domain_put(rqstp->rq_client); rqstp->rq_client = NULL; if (rqstp->rq_gssclient) auth_domain_put(rqstp->rq_gssclient); rqstp->rq_gssclient = NULL; if (rqstp->rq_cred.cr_group_info) put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; if (gsd && gsd->rsci) { cache_put(&gsd->rsci->h, sn->rsc_cache); gsd->rsci = NULL; } return stat; } static void svcauth_gss_domain_release_rcu(struct rcu_head *head) { struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head); struct gss_domain *gd = container_of(dom, struct gss_domain, h); kfree(dom->name); kfree(gd); } static void svcauth_gss_domain_release(struct auth_domain *dom) { call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu); } static rpc_authflavor_t svcauth_gss_pseudoflavor(struct svc_rqst *rqstp) { return svcauth_gss_flavor(rqstp->rq_gssclient); } static struct auth_ops svcauthops_gss = { .name = "rpcsec_gss", .owner = THIS_MODULE, .flavour = RPC_AUTH_GSS, .accept = svcauth_gss_accept, .release = svcauth_gss_release, .domain_release = svcauth_gss_domain_release, .set_client = svcauth_gss_set_client, .pseudoflavor = svcauth_gss_pseudoflavor, }; static int rsi_cache_create_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd; int err; cd = cache_create_net(&rsi_cache_template, net); if (IS_ERR(cd)) return PTR_ERR(cd); err = cache_register_net(cd, net); if (err) { cache_destroy_net(cd, net); return err; } sn->rsi_cache = cd; return 0; } static void rsi_cache_destroy_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd = sn->rsi_cache; sn->rsi_cache = NULL; cache_purge(cd); cache_unregister_net(cd, net); cache_destroy_net(cd, net); } static int rsc_cache_create_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd; int err; cd = cache_create_net(&rsc_cache_template, net); if (IS_ERR(cd)) return PTR_ERR(cd); err = cache_register_net(cd, net); if (err) { cache_destroy_net(cd, net); return err; } sn->rsc_cache = cd; return 0; } static void rsc_cache_destroy_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd = sn->rsc_cache; sn->rsc_cache = NULL; cache_purge(cd); cache_unregister_net(cd, net); cache_destroy_net(cd, net); } int gss_svc_init_net(struct net *net) { int rv; rv = rsc_cache_create_net(net); if (rv) return rv; rv = rsi_cache_create_net(net); if (rv) goto out1; rv = create_use_gss_proxy_proc_entry(net); if (rv) goto out2; rv = create_krb5_enctypes_proc_entry(net); if (rv) goto out3; return 0; out3: destroy_use_gss_proxy_proc_entry(net); out2: rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; } void gss_svc_shutdown_net(struct net *net) { destroy_krb5_enctypes_proc_entry(net); destroy_use_gss_proxy_proc_entry(net); rsi_cache_destroy_net(net); rsc_cache_destroy_net(net); } int gss_svc_init(void) { return svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); } void gss_svc_shutdown(void) { svc_auth_unregister(RPC_AUTH_GSS); }
1 134 28 144 15 1 147 49 134 1 29 72 149 13 62 128 30 28 29 1 28 10 28 1 1 48 134 144 28 15 146 145 165 163 37 128 44 7 37 36 35 24 20 41 43 92 3 83 9 9 80 98 3 93 90 4 85 28 3 30 3 4 1 8 74 75 68 23 67 24 76 4 72 4 26 15 16 20 4 4 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 // SPDX-License-Identifier: GPL-2.0-or-later /* * Symmetric key cipher operations. * * Generic encrypt/decrypt wrapper for ciphers, handles operations across * multiple page boundaries by using temporary blocks. In user context, * the kernel is given a chance to schedule us once per page. * * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/bug.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <net/netlink.h> #include "skcipher.h" #define CRYPTO_ALG_TYPE_SKCIPHER_MASK 0x0000000e enum { SKCIPHER_WALK_SLOW = 1 << 0, SKCIPHER_WALK_COPY = 1 << 1, SKCIPHER_WALK_DIFF = 1 << 2, SKCIPHER_WALK_SLEEP = 1 << 3, }; static const struct crypto_type crypto_skcipher_type; static int skcipher_walk_next(struct skcipher_walk *walk); static inline void skcipher_map_src(struct skcipher_walk *walk) { walk->src.virt.addr = scatterwalk_map(&walk->in); } static inline void skcipher_map_dst(struct skcipher_walk *walk) { walk->dst.virt.addr = scatterwalk_map(&walk->out); } static inline void skcipher_unmap_src(struct skcipher_walk *walk) { scatterwalk_unmap(walk->src.virt.addr); } static inline void skcipher_unmap_dst(struct skcipher_walk *walk) { scatterwalk_unmap(walk->dst.virt.addr); } static inline gfp_t skcipher_walk_gfp(struct skcipher_walk *walk) { return walk->flags & SKCIPHER_WALK_SLEEP ? GFP_KERNEL : GFP_ATOMIC; } static inline struct skcipher_alg *__crypto_skcipher_alg( struct crypto_alg *alg) { return container_of(alg, struct skcipher_alg, base); } static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize) { u8 *addr = PTR_ALIGN(walk->buffer, walk->alignmask + 1); scatterwalk_copychunks(addr, &walk->out, bsize, 1); return 0; } /** * skcipher_walk_done() - finish one step of a skcipher_walk * @walk: the skcipher_walk * @res: number of bytes *not* processed (>= 0) from walk->nbytes, * or a -errno value to terminate the walk due to an error * * This function cleans up after one step of walking through the source and * destination scatterlists, and advances to the next step if applicable. * walk->nbytes is set to the number of bytes available in the next step, * walk->total is set to the new total number of bytes remaining, and * walk->{src,dst}.virt.addr is set to the next pair of data pointers. If there * is no more data, or if an error occurred (i.e. -errno return), then * walk->nbytes and walk->total are set to 0 and all resources owned by the * skcipher_walk are freed. * * Return: 0 or a -errno value. If @res was a -errno value then it will be * returned, but other errors may occur too. */ int skcipher_walk_done(struct skcipher_walk *walk, int res) { unsigned int n = walk->nbytes; /* num bytes processed this step */ unsigned int total = 0; /* new total remaining */ if (!n) goto finish; if (likely(res >= 0)) { n -= res; /* subtract num bytes *not* processed */ total = walk->total - n; } if (likely(!(walk->flags & (SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY | SKCIPHER_WALK_DIFF)))) { unmap_src: skcipher_unmap_src(walk); } else if (walk->flags & SKCIPHER_WALK_DIFF) { skcipher_unmap_dst(walk); goto unmap_src; } else if (walk->flags & SKCIPHER_WALK_COPY) { skcipher_map_dst(walk); memcpy(walk->dst.virt.addr, walk->page, n); skcipher_unmap_dst(walk); } else { /* SKCIPHER_WALK_SLOW */ if (res > 0) { /* * Didn't process all bytes. Either the algorithm is * broken, or this was the last step and it turned out * the message wasn't evenly divisible into blocks but * the algorithm requires it. */ res = -EINVAL; total = 0; } else n = skcipher_done_slow(walk, n); } if (res > 0) res = 0; walk->total = total; walk->nbytes = 0; scatterwalk_advance(&walk->in, n); scatterwalk_advance(&walk->out, n); scatterwalk_done(&walk->in, 0, total); scatterwalk_done(&walk->out, 1, total); if (total) { if (walk->flags & SKCIPHER_WALK_SLEEP) cond_resched(); walk->flags &= ~(SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY | SKCIPHER_WALK_DIFF); return skcipher_walk_next(walk); } finish: /* Short-circuit for the common/fast path. */ if (!((unsigned long)walk->buffer | (unsigned long)walk->page)) goto out; if (walk->iv != walk->oiv) memcpy(walk->oiv, walk->iv, walk->ivsize); if (walk->buffer != walk->page) kfree(walk->buffer); if (walk->page) free_page((unsigned long)walk->page); out: return res; } EXPORT_SYMBOL_GPL(skcipher_walk_done); static int skcipher_next_slow(struct skcipher_walk *walk, unsigned int bsize) { unsigned alignmask = walk->alignmask; unsigned n; u8 *buffer; if (!walk->buffer) walk->buffer = walk->page; buffer = walk->buffer; if (!buffer) { /* Min size for a buffer of bsize bytes aligned to alignmask */ n = bsize + (alignmask & ~(crypto_tfm_ctx_alignment() - 1)); buffer = kzalloc(n, skcipher_walk_gfp(walk)); if (!buffer) return skcipher_walk_done(walk, -ENOMEM); walk->buffer = buffer; } walk->dst.virt.addr = PTR_ALIGN(buffer, alignmask + 1); walk->src.virt.addr = walk->dst.virt.addr; scatterwalk_copychunks(walk->src.virt.addr, &walk->in, bsize, 0); walk->nbytes = bsize; walk->flags |= SKCIPHER_WALK_SLOW; return 0; } static int skcipher_next_copy(struct skcipher_walk *walk) { u8 *tmp = walk->page; skcipher_map_src(walk); memcpy(tmp, walk->src.virt.addr, walk->nbytes); skcipher_unmap_src(walk); walk->src.virt.addr = tmp; walk->dst.virt.addr = tmp; return 0; } static int skcipher_next_fast(struct skcipher_walk *walk) { unsigned long diff; diff = offset_in_page(walk->in.offset) - offset_in_page(walk->out.offset); diff |= (u8 *)scatterwalk_page(&walk->in) - (u8 *)scatterwalk_page(&walk->out); skcipher_map_src(walk); walk->dst.virt.addr = walk->src.virt.addr; if (diff) { walk->flags |= SKCIPHER_WALK_DIFF; skcipher_map_dst(walk); } return 0; } static int skcipher_walk_next(struct skcipher_walk *walk) { unsigned int bsize; unsigned int n; n = walk->total; bsize = min(walk->stride, max(n, walk->blocksize)); n = scatterwalk_clamp(&walk->in, n); n = scatterwalk_clamp(&walk->out, n); if (unlikely(n < bsize)) { if (unlikely(walk->total < walk->blocksize)) return skcipher_walk_done(walk, -EINVAL); slow_path: return skcipher_next_slow(walk, bsize); } walk->nbytes = n; if (unlikely((walk->in.offset | walk->out.offset) & walk->alignmask)) { if (!walk->page) { gfp_t gfp = skcipher_walk_gfp(walk); walk->page = (void *)__get_free_page(gfp); if (!walk->page) goto slow_path; } walk->flags |= SKCIPHER_WALK_COPY; return skcipher_next_copy(walk); } return skcipher_next_fast(walk); } static int skcipher_copy_iv(struct skcipher_walk *walk) { unsigned alignmask = walk->alignmask; unsigned ivsize = walk->ivsize; unsigned aligned_stride = ALIGN(walk->stride, alignmask + 1); unsigned size; u8 *iv; /* Min size for a buffer of stride + ivsize, aligned to alignmask */ size = aligned_stride + ivsize + (alignmask & ~(crypto_tfm_ctx_alignment() - 1)); walk->buffer = kmalloc(size, skcipher_walk_gfp(walk)); if (!walk->buffer) return -ENOMEM; iv = PTR_ALIGN(walk->buffer, alignmask + 1) + aligned_stride; walk->iv = memcpy(iv, walk->iv, walk->ivsize); return 0; } static int skcipher_walk_first(struct skcipher_walk *walk) { if (WARN_ON_ONCE(in_hardirq())) return -EDEADLK; walk->buffer = NULL; if (unlikely(((unsigned long)walk->iv & walk->alignmask))) { int err = skcipher_copy_iv(walk); if (err) return err; } walk->page = NULL; return skcipher_walk_next(walk); } int skcipher_walk_virt(struct skcipher_walk *walk, struct skcipher_request *req, bool atomic) { const struct skcipher_alg *alg = crypto_skcipher_alg(crypto_skcipher_reqtfm(req)); might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP); walk->total = req->cryptlen; walk->nbytes = 0; walk->iv = req->iv; walk->oiv = req->iv; if ((req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) && !atomic) walk->flags = SKCIPHER_WALK_SLEEP; else walk->flags = 0; if (unlikely(!walk->total)) return 0; scatterwalk_start(&walk->in, req->src); scatterwalk_start(&walk->out, req->dst); /* * Accessing 'alg' directly generates better code than using the * crypto_skcipher_blocksize() and similar helper functions here, as it * prevents the algorithm pointer from being repeatedly reloaded. */ walk->blocksize = alg->base.cra_blocksize; walk->ivsize = alg->co.ivsize; walk->alignmask = alg->base.cra_alignmask; if (alg->co.base.cra_type != &crypto_skcipher_type) walk->stride = alg->co.chunksize; else walk->stride = alg->walksize; return skcipher_walk_first(walk); } EXPORT_SYMBOL_GPL(skcipher_walk_virt); static int skcipher_walk_aead_common(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { const struct aead_alg *alg = crypto_aead_alg(crypto_aead_reqtfm(req)); walk->nbytes = 0; walk->iv = req->iv; walk->oiv = req->iv; if ((req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) && !atomic) walk->flags = SKCIPHER_WALK_SLEEP; else walk->flags = 0; if (unlikely(!walk->total)) return 0; scatterwalk_start(&walk->in, req->src); scatterwalk_start(&walk->out, req->dst); scatterwalk_copychunks(NULL, &walk->in, req->assoclen, 2); scatterwalk_copychunks(NULL, &walk->out, req->assoclen, 2); scatterwalk_done(&walk->in, 0, walk->total); scatterwalk_done(&walk->out, 0, walk->total); /* * Accessing 'alg' directly generates better code than using the * crypto_aead_blocksize() and similar helper functions here, as it * prevents the algorithm pointer from being repeatedly reloaded. */ walk->blocksize = alg->base.cra_blocksize; walk->stride = alg->chunksize; walk->ivsize = alg->ivsize; walk->alignmask = alg->base.cra_alignmask; return skcipher_walk_first(walk); } int skcipher_walk_aead_encrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { walk->total = req->cryptlen; return skcipher_walk_aead_common(walk, req, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_aead_encrypt); int skcipher_walk_aead_decrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); walk->total = req->cryptlen - crypto_aead_authsize(tfm); return skcipher_walk_aead_common(walk, req, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_aead_decrypt); static void skcipher_set_needkey(struct crypto_skcipher *tfm) { if (crypto_skcipher_max_keysize(tfm) != 0) crypto_skcipher_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { unsigned long alignmask = crypto_skcipher_alignmask(tfm); struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); u8 *buffer, *alignbuffer; unsigned long absize; int ret; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cipher->setkey(tfm, alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); unsigned long alignmask = crypto_skcipher_alignmask(tfm); int err; if (cipher->co.base.cra_type != &crypto_skcipher_type) { struct crypto_lskcipher **ctx = crypto_skcipher_ctx(tfm); crypto_lskcipher_clear_flags(*ctx, CRYPTO_TFM_REQ_MASK); crypto_lskcipher_set_flags(*ctx, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_lskcipher_setkey(*ctx, key, keylen); goto out; } if (keylen < cipher->min_keysize || keylen > cipher->max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) err = skcipher_setkey_unaligned(tfm, key, keylen); else err = cipher->setkey(tfm, key, keylen); out: if (unlikely(err)) { skcipher_set_needkey(tfm); return err; } crypto_skcipher_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_skcipher_setkey); int crypto_skcipher_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_encrypt_sg(req); return alg->encrypt(req); } EXPORT_SYMBOL_GPL(crypto_skcipher_encrypt); int crypto_skcipher_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_decrypt_sg(req); return alg->decrypt(req); } EXPORT_SYMBOL_GPL(crypto_skcipher_decrypt); static int crypto_lskcipher_export(struct skcipher_request *req, void *out) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); u8 *ivs = skcipher_request_ctx(req); ivs = PTR_ALIGN(ivs, crypto_skcipher_alignmask(tfm) + 1); memcpy(out, ivs + crypto_skcipher_ivsize(tfm), crypto_skcipher_statesize(tfm)); return 0; } static int crypto_lskcipher_import(struct skcipher_request *req, const void *in) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); u8 *ivs = skcipher_request_ctx(req); ivs = PTR_ALIGN(ivs, crypto_skcipher_alignmask(tfm) + 1); memcpy(ivs + crypto_skcipher_ivsize(tfm), in, crypto_skcipher_statesize(tfm)); return 0; } static int skcipher_noexport(struct skcipher_request *req, void *out) { return 0; } static int skcipher_noimport(struct skcipher_request *req, const void *in) { return 0; } int crypto_skcipher_export(struct skcipher_request *req, void *out) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_export(req, out); return alg->export(req, out); } EXPORT_SYMBOL_GPL(crypto_skcipher_export); int crypto_skcipher_import(struct skcipher_request *req, const void *in) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_import(req, in); return alg->import(req, in); } EXPORT_SYMBOL_GPL(crypto_skcipher_import); static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); alg->exit(skcipher); } static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); skcipher_set_needkey(skcipher); if (tfm->__crt_alg->cra_type != &crypto_skcipher_type) { unsigned am = crypto_skcipher_alignmask(skcipher); unsigned reqsize; reqsize = am & ~(crypto_tfm_ctx_alignment() - 1); reqsize += crypto_skcipher_ivsize(skcipher); reqsize += crypto_skcipher_statesize(skcipher); crypto_skcipher_set_reqsize(skcipher, reqsize); return crypto_init_lskcipher_ops_sg(tfm); } if (alg->exit) skcipher->base.exit = crypto_skcipher_exit_tfm; if (alg->init) return alg->init(skcipher); return 0; } static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg) { if (alg->cra_type != &crypto_skcipher_type) return sizeof(struct crypto_lskcipher *); return crypto_alg_extsize(alg); } static void crypto_skcipher_free_instance(struct crypto_instance *inst) { struct skcipher_instance *skcipher = container_of(inst, struct skcipher_instance, s.base); skcipher->free(skcipher); } static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); seq_printf(m, "type : skcipher\n"); seq_printf(m, "async : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ? "yes" : "no"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "min keysize : %u\n", skcipher->min_keysize); seq_printf(m, "max keysize : %u\n", skcipher->max_keysize); seq_printf(m, "ivsize : %u\n", skcipher->ivsize); seq_printf(m, "chunksize : %u\n", skcipher->chunksize); seq_printf(m, "walksize : %u\n", skcipher->walksize); seq_printf(m, "statesize : %u\n", skcipher->statesize); } static int __maybe_unused crypto_skcipher_report( struct sk_buff *skb, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); struct crypto_report_blkcipher rblkcipher; memset(&rblkcipher, 0, sizeof(rblkcipher)); strscpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type)); strscpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv)); rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = skcipher->min_keysize; rblkcipher.max_keysize = skcipher->max_keysize; rblkcipher.ivsize = skcipher->ivsize; return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, sizeof(rblkcipher), &rblkcipher); } static const struct crypto_type crypto_skcipher_type = { .extsize = crypto_skcipher_extsize, .init_tfm = crypto_skcipher_init_tfm, .free = crypto_skcipher_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_skcipher_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_skcipher_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_SKCIPHER_MASK, .type = CRYPTO_ALG_TYPE_SKCIPHER, .tfmsize = offsetof(struct crypto_skcipher, base), }; int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_skcipher_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_skcipher); struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_skcipher); struct crypto_sync_skcipher *crypto_alloc_sync_skcipher( const char *alg_name, u32 type, u32 mask) { struct crypto_skcipher *tfm; /* Only sync algorithms allowed. */ mask |= CRYPTO_ALG_ASYNC | CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE; tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); /* * Make sure we do not allocate something that might get used with * an on-stack request: check the request size. */ if (!IS_ERR(tfm) && WARN_ON(crypto_skcipher_reqsize(tfm) > MAX_SYNC_SKCIPHER_REQSIZE)) { crypto_free_skcipher(tfm); return ERR_PTR(-EINVAL); } return (struct crypto_sync_skcipher *)tfm; } EXPORT_SYMBOL_GPL(crypto_alloc_sync_skcipher); int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_skcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_skcipher); int skcipher_prepare_alg_common(struct skcipher_alg_common *alg) { struct crypto_alg *base = &alg->base; if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 || alg->statesize > PAGE_SIZE / 2 || (alg->ivsize + alg->statesize) > PAGE_SIZE / 2) return -EINVAL; if (!alg->chunksize) alg->chunksize = base->cra_blocksize; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; return 0; } static int skcipher_prepare_alg(struct skcipher_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = skcipher_prepare_alg_common(&alg->co); if (err) return err; if (alg->walksize > PAGE_SIZE / 8) return -EINVAL; if (!alg->walksize) alg->walksize = alg->chunksize; if (!alg->statesize) { alg->import = skcipher_noimport; alg->export = skcipher_noexport; } else if (!(alg->import && alg->export)) return -EINVAL; base->cra_type = &crypto_skcipher_type; base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER; return 0; } int crypto_register_skcipher(struct skcipher_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = skcipher_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_skcipher); void crypto_unregister_skcipher(struct skcipher_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_skcipher); int crypto_register_skciphers(struct skcipher_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_skcipher(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_skcipher(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_skciphers); void crypto_unregister_skciphers(struct skcipher_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_skcipher(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_skciphers); int skcipher_register_instance(struct crypto_template *tmpl, struct skcipher_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = skcipher_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, skcipher_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(skcipher_register_instance); static int skcipher_setkey_simple(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); crypto_cipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(cipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_cipher_setkey(cipher, key, keylen); } static int skcipher_init_tfm_simple(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct crypto_cipher_spawn *spawn = skcipher_instance_ctx(inst); struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); struct crypto_cipher *cipher; cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->cipher = cipher; return 0; } static void skcipher_exit_tfm_simple(struct crypto_skcipher *tfm) { struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); crypto_free_cipher(ctx->cipher); } static void skcipher_free_instance_simple(struct skcipher_instance *inst) { crypto_drop_cipher(skcipher_instance_ctx(inst)); kfree(inst); } /** * skcipher_alloc_instance_simple - allocate instance of simple block cipher mode * * Allocate an skcipher_instance for a simple block cipher mode of operation, * e.g. cbc or ecb. The instance context will have just a single crypto_spawn, * that for the underlying cipher. The {min,max}_keysize, ivsize, blocksize, * alignmask, and priority are set from the underlying cipher but can be * overridden if needed. The tfm context defaults to skcipher_ctx_simple, and * default ->setkey(), ->init(), and ->exit() methods are installed. * * @tmpl: the template being instantiated * @tb: the template parameters * * Return: a pointer to the new instance, or an ERR_PTR(). The caller still * needs to register the instance. */ struct skcipher_instance *skcipher_alloc_instance_simple( struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct skcipher_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *cipher_alg; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask); if (err) return ERR_PTR(err); inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return ERR_PTR(-ENOMEM); spawn = skcipher_instance_ctx(inst); err = crypto_grab_cipher(spawn, skcipher_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; cipher_alg = crypto_spawn_cipher_alg(spawn); err = crypto_inst_setname(skcipher_crypto_instance(inst), tmpl->name, cipher_alg); if (err) goto err_free_inst; inst->free = skcipher_free_instance_simple; /* Default algorithm properties, can be overridden */ inst->alg.base.cra_blocksize = cipher_alg->cra_blocksize; inst->alg.base.cra_alignmask = cipher_alg->cra_alignmask; inst->alg.base.cra_priority = cipher_alg->cra_priority; inst->alg.min_keysize = cipher_alg->cra_cipher.cia_min_keysize; inst->alg.max_keysize = cipher_alg->cra_cipher.cia_max_keysize; inst->alg.ivsize = cipher_alg->cra_blocksize; /* Use skcipher_ctx_simple by default, can be overridden */ inst->alg.base.cra_ctxsize = sizeof(struct skcipher_ctx_simple); inst->alg.setkey = skcipher_setkey_simple; inst->alg.init = skcipher_init_tfm_simple; inst->alg.exit = skcipher_exit_tfm_simple; return inst; err_free_inst: skcipher_free_instance_simple(inst); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skcipher_alloc_instance_simple); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Symmetric key cipher type"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
15 5 7 1 1 1 1 1 3 2 2 20 1 1 6 11 1 17 1 1 7 1 7 1 7 6 6 1 12 11 1 5 2 2 4 4 1 3 3 7 1 1 1 2 2 2 2 2 2 1 3 5 2 2 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_pedit.c Generic packet editor * * Authors: Jamal Hadi Salim (2002-4) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/slab.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_pedit.h> #include <net/tc_act/tc_pedit.h> #include <uapi/linux/tc_act/tc_pedit.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, [TCA_PEDIT_PARMS_EX] = { .len = sizeof(struct tc_pedit) }, [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED }, }; static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = { [TCA_PEDIT_KEY_EX_HTYPE] = NLA_POLICY_MAX(NLA_U16, TCA_PEDIT_HDR_TYPE_MAX), [TCA_PEDIT_KEY_EX_CMD] = NLA_POLICY_MAX(NLA_U16, TCA_PEDIT_CMD_MAX), }; static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, u8 n, struct netlink_ext_ack *extack) { struct tcf_pedit_key_ex *keys_ex; struct tcf_pedit_key_ex *k; const struct nlattr *ka; int err = -EINVAL; int rem; if (!nla) return NULL; keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); if (!keys_ex) return ERR_PTR(-ENOMEM); k = keys_ex; nla_for_each_nested(ka, nla, rem) { struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1]; if (!n) { NL_SET_ERR_MSG_MOD(extack, "Can't parse more extended keys than requested"); err = -EINVAL; goto err_out; } n--; if (nla_type(ka) != TCA_PEDIT_KEY_EX) { NL_SET_ERR_MSG_ATTR(extack, ka, "Unknown attribute, expected extended key"); err = -EINVAL; goto err_out; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_KEY_EX_MAX, ka, pedit_key_ex_policy, NULL); if (err) goto err_out; if (NL_REQ_ATTR_CHECK(extack, nla, tb, TCA_PEDIT_KEY_EX_HTYPE)) { NL_SET_ERR_MSG(extack, "Missing required attribute"); err = -EINVAL; goto err_out; } if (NL_REQ_ATTR_CHECK(extack, nla, tb, TCA_PEDIT_KEY_EX_CMD)) { NL_SET_ERR_MSG(extack, "Missing required attribute"); err = -EINVAL; goto err_out; } k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]); k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]); k++; } if (n) { NL_SET_ERR_MSG_MOD(extack, "Not enough extended keys to parse"); err = -EINVAL; goto err_out; } return keys_ex; err_out: kfree(keys_ex); return ERR_PTR(err); } static int tcf_pedit_key_ex_dump(struct sk_buff *skb, struct tcf_pedit_key_ex *keys_ex, int n) { struct nlattr *keys_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEYS_EX); if (!keys_start) goto nla_failure; for (; n > 0; n--) { struct nlattr *key_start; key_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEY_EX); if (!key_start) goto nla_failure; if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) || nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) goto nla_failure; nla_nest_end(skb, key_start); keys_ex++; } nla_nest_end(skb, keys_start); return 0; nla_failure: nla_nest_cancel(skb, keys_start); return -EINVAL; } static void tcf_pedit_cleanup_rcu(struct rcu_head *head) { struct tcf_pedit_parms *parms = container_of(head, struct tcf_pedit_parms, rcu); kfree(parms->tcfp_keys_ex); kfree(parms->tcfp_keys); kfree(parms); } static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_pedit_parms *oparms, *nparms; struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; struct nlattr *pattr; struct tcf_pedit *p; int ret = 0, err; int i, ksize; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) return err; pattr = tb[TCA_PEDIT_PARMS]; if (!pattr) pattr = tb[TCA_PEDIT_PARMS_EX]; if (!pattr) { NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute"); return -EINVAL; } parm = nla_data(pattr); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_pedit_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; } } else { return err; } if (!parm->nkeys) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); ret = -EINVAL; goto out_release; } ksize = parm->nkeys * sizeof(struct tc_pedit_key); if (nla_len(pattr) < sizeof(*parm) + ksize) { NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); ret = -EINVAL; goto out_release; } nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); if (!nparms) { ret = -ENOMEM; goto out_release; } nparms->tcfp_keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys, extack); if (IS_ERR(nparms->tcfp_keys_ex)) { ret = PTR_ERR(nparms->tcfp_keys_ex); goto out_free; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) { ret = err; goto out_free_ex; } nparms->tcfp_off_max_hint = 0; nparms->tcfp_flags = parm->flags; nparms->tcfp_nkeys = parm->nkeys; nparms->tcfp_keys = kmemdup(parm->keys, ksize, GFP_KERNEL); if (!nparms->tcfp_keys) { ret = -ENOMEM; goto put_chain; } for (i = 0; i < nparms->tcfp_nkeys; ++i) { u32 offmask = nparms->tcfp_keys[i].offmask; u32 cur = nparms->tcfp_keys[i].off; /* The AT option can be added to static offsets in the datapath */ if (!offmask && cur % 4) { NL_SET_ERR_MSG_MOD(extack, "Offsets must be on 32bit boundaries"); ret = -EINVAL; goto out_free_keys; } /* sanitize the shift value for any later use */ nparms->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, nparms->tcfp_keys[i].shift); /* The AT option can read a single byte, we can bound the actual * value with uchar max. */ cur += (0xff & offmask) >> nparms->tcfp_keys[i].shift; /* Each key touches 4 bytes starting from the computed offset */ nparms->tcfp_off_max_hint = max(nparms->tcfp_off_max_hint, cur + 4); } p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); spin_unlock_bh(&p->tcf_lock); if (oparms) call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; out_free_keys: kfree(nparms->tcfp_keys); put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); out_free_ex: kfree(nparms->tcfp_keys_ex); out_free: kfree(nparms); out_release: tcf_idr_release(*a, bind); return ret; } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; parms = rcu_dereference_protected(p->parms, 1); if (parms) call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu); } static bool offset_valid(struct sk_buff *skb, int offset) { if (offset > 0 && offset > skb->len) return false; if (offset < 0 && -offset > skb_headroom(skb)) return false; return true; } static int pedit_l4_skb_offset(struct sk_buff *skb, int *hoffset, const int header_type) { const int noff = skb_network_offset(skb); int ret = -EINVAL; struct iphdr _iph; switch (skb->protocol) { case htons(ETH_P_IP): { const struct iphdr *iph = skb_header_pointer(skb, noff, sizeof(_iph), &_iph); if (!iph) goto out; *hoffset = noff + iph->ihl * 4; ret = 0; break; } case htons(ETH_P_IPV6): ret = ipv6_find_hdr(skb, hoffset, header_type, NULL, NULL) == header_type ? 0 : -EINVAL; break; } out: return ret; } static int pedit_skb_hdr_offset(struct sk_buff *skb, enum pedit_header_type htype, int *hoffset) { int ret = -EINVAL; /* 'htype' is validated in the netlink parsing */ switch (htype) { case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: if (skb_mac_header_was_set(skb)) { *hoffset = skb_mac_offset(skb); ret = 0; } break; case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: *hoffset = skb_network_offset(skb); ret = 0; break; case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_TCP); break; case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_UDP); break; default: break; } return ret; } TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_key_ex *tkey_ex; struct tcf_pedit_parms *parms; struct tc_pedit_key *tkey; u32 max_offset; int i; parms = rcu_dereference_bh(p->parms); max_offset = (skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : skb_network_offset(skb)) + parms->tcfp_off_max_hint; if (skb_ensure_writable(skb, min(skb->len, max_offset))) goto done; tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); tkey = parms->tcfp_keys; tkey_ex = parms->tcfp_keys_ex; for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) { int offset = tkey->off; int hoffset = 0; u32 *ptr, hdata; u32 val; int rc; if (tkey_ex) { htype = tkey_ex->htype; cmd = tkey_ex->cmd; tkey_ex++; } rc = pedit_skb_hdr_offset(skb, htype, &hoffset); if (rc) { pr_info_ratelimited("tc action pedit unable to extract header offset for header type (0x%x)\n", htype); goto bad; } if (tkey->offmask) { u8 *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { pr_info_ratelimited("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } d = skb_header_pointer(skb, hoffset + tkey->at, sizeof(_d), &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; if (offset % 4) { pr_info_ratelimited("tc action pedit offset must be on 32 bit boundaries\n"); goto bad; } } if (!offset_valid(skb, hoffset + offset)) { pr_info_ratelimited("tc action pedit offset %d out of bounds\n", hoffset + offset); goto bad; } ptr = skb_header_pointer(skb, hoffset + offset, sizeof(hdata), &hdata); if (!ptr) goto bad; /* just do it, baby */ switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: val = tkey->val; break; case TCA_PEDIT_KEY_EX_CMD_ADD: val = (*ptr + tkey->val) & ~tkey->mask; break; default: pr_info_ratelimited("tc action pedit bad command (%d)\n", cmd); goto bad; } *ptr = ((*ptr & tkey->mask) ^ val); if (ptr == &hdata) skb_store_bits(skb, hoffset + offset, ptr, 4); } goto done; bad: tcf_action_inc_overlimit_qstats(&p->common); done: return p->tcf_action; } static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_pedit *d = to_pedit(a); struct tcf_t *tm = &d->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; spin_lock_bh(&p->tcf_lock); parms = rcu_dereference_protected(p->parms, 1); s = struct_size(opt, keys, parms->tcfp_nkeys); opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt)) { spin_unlock_bh(&p->tcf_lock); return -ENOBUFS; } opt->nkeys = parms->tcfp_nkeys; memcpy(opt->keys, parms->tcfp_keys, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; opt->flags = parms->tcfp_flags; opt->action = p->tcf_action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (parms->tcfp_keys_ex) { if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex, parms->tcfp_nkeys)) goto nla_put_failure; if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) goto nla_put_failure; } else { if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) goto nla_put_failure; } tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; spin_unlock_bh(&p->tcf_lock); kfree(opt); return skb->len; nla_put_failure: spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); kfree(opt); return -1; } static int tcf_pedit_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; int k; for (k = 0; k < tcf_pedit_nkeys(act); k++) { switch (tcf_pedit_cmd(act, k)) { case TCA_PEDIT_KEY_EX_CMD_SET: entry->id = FLOW_ACTION_MANGLE; break; case TCA_PEDIT_KEY_EX_CMD_ADD: entry->id = FLOW_ACTION_ADD; break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } entry->mangle.htype = tcf_pedit_htype(act, k); entry->mangle.mask = tcf_pedit_mask(act, k); entry->mangle.val = tcf_pedit_val(act, k); entry->mangle.offset = tcf_pedit_offset(act, k); entry->hw_stats = tc_act_hw_stats(act->hw_stats); entry++; } *index_inc = k; } else { struct flow_offload_action *fl_action = entry_data; u32 cmd = tcf_pedit_cmd(act, 0); int k; switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: fl_action->id = FLOW_ACTION_MANGLE; break; case TCA_PEDIT_KEY_EX_CMD_ADD: fl_action->id = FLOW_ACTION_ADD; break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } for (k = 1; k < tcf_pedit_nkeys(act); k++) { if (cmd != tcf_pedit_cmd(act, k)) { NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } } } return 0; } static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .id = TCA_ID_PEDIT, .owner = THIS_MODULE, .act = tcf_pedit_act, .stats_update = tcf_pedit_stats_update, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, .offload_act_setup = tcf_pedit_offload_act_setup, .size = sizeof(struct tcf_pedit), }; MODULE_ALIAS_NET_ACT("pedit"); static __net_init int pedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); return tc_action_net_init(net, tn, &act_pedit_ops); } static void __net_exit pedit_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_pedit_ops.net_id); } static struct pernet_operations pedit_net_ops = { .init = pedit_init_net, .exit_batch = pedit_exit_net, .id = &act_pedit_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Packet Editor actions"); MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { return tcf_register_action(&act_pedit_ops, &pedit_net_ops); } static void __exit pedit_cleanup_module(void) { tcf_unregister_action(&act_pedit_ops, &pedit_net_ops); } module_init(pedit_init_module); module_exit(pedit_cleanup_module);
3 12 1 1 914 915 12 5 8 6 6 6 3 3 3 3 14 1 1 12 1 1 1 1 1 1 1 1 1 1 1 9 1 1 1 1 1 1 2 1 1 1 2 1 1 1 3 1 1 7 3 4 3 1 4 8 1 1 1 1 1 1 2 3 3 10 9 10 1 1 1 8 1 1 5 1 2 2 3 1 4 1 2 1 1 1 3 1 1 1 13 1 1 3 3 5 5 1 1 3 3 2 1 1 1 1 32 27 1 18 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk) */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/stat.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <net/netrom.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/ip.h> #include <net/tcp_states.h> #include <net/arp.h> #include <linux/init.h> static int nr_ndevs = 4; int sysctl_netrom_default_path_quality = NR_DEFAULT_QUAL; int sysctl_netrom_obsolescence_count_initialiser = NR_DEFAULT_OBS; int sysctl_netrom_network_ttl_initialiser = NR_DEFAULT_TTL; int sysctl_netrom_transport_timeout = NR_DEFAULT_T1; int sysctl_netrom_transport_maximum_tries = NR_DEFAULT_N2; int sysctl_netrom_transport_acknowledge_delay = NR_DEFAULT_T2; int sysctl_netrom_transport_busy_delay = NR_DEFAULT_T4; int sysctl_netrom_transport_requested_window_size = NR_DEFAULT_WINDOW; int sysctl_netrom_transport_no_activity_timeout = NR_DEFAULT_IDLE; int sysctl_netrom_routing_control = NR_DEFAULT_ROUTING; int sysctl_netrom_link_fails_count = NR_DEFAULT_FAILS; int sysctl_netrom_reset_circuit = NR_DEFAULT_RESET; static unsigned short circuit = 0x101; static HLIST_HEAD(nr_list); static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; /* * NETROM network devices are virtual network devices encapsulating NETROM * frames into AX.25 which will be sent through an AX.25 device, so form a * special "super class" of normal net devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key nr_netdev_xmit_lock_key; static struct lock_class_key nr_netdev_addr_lock_key; static void nr_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) { lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); } static void nr_set_lockdep_key(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); } /* * Socket removal during an interrupt is now safe. */ static void nr_remove_socket(struct sock *sk) { spin_lock_bh(&nr_list_lock); sk_del_node_init(sk); spin_unlock_bh(&nr_list_lock); } /* * Kill all bound sockets on a dropped device. */ static void nr_kill_by_device(struct net_device *dev) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) if (nr_sk(s)->device == dev) nr_disconnect(s, ENETUNREACH); spin_unlock_bh(&nr_list_lock); } /* * Handle device status changes. */ static int nr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event != NETDEV_DOWN) return NOTIFY_DONE; nr_kill_by_device(dev); nr_rt_device_down(dev); return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void nr_insert_socket(struct sock *sk) { spin_lock_bh(&nr_list_lock); sk_add_node(sk, &nr_list); spin_unlock_bh(&nr_list_lock); } /* * Find a socket that wants to accept the Connect Request we just * received. */ static struct sock *nr_find_listener(ax25_address *addr) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) if (!ax25cmp(&nr_sk(s)->source_addr, addr) && s->sk_state == TCP_LISTEN) { sock_hold(s); goto found; } s = NULL; found: spin_unlock_bh(&nr_list_lock); return s; } /* * Find a connected NET/ROM socket given my circuit IDs. */ static struct sock *nr_find_socket(unsigned char index, unsigned char id) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) { struct nr_sock *nr = nr_sk(s); if (nr->my_index == index && nr->my_id == id) { sock_hold(s); goto found; } } s = NULL; found: spin_unlock_bh(&nr_list_lock); return s; } /* * Find a connected NET/ROM socket given their circuit IDs. */ static struct sock *nr_find_peer(unsigned char index, unsigned char id, ax25_address *dest) { struct sock *s; spin_lock_bh(&nr_list_lock); sk_for_each(s, &nr_list) { struct nr_sock *nr = nr_sk(s); if (nr->your_index == index && nr->your_id == id && !ax25cmp(&nr->dest_addr, dest)) { sock_hold(s); goto found; } } s = NULL; found: spin_unlock_bh(&nr_list_lock); return s; } /* * Find next free circuit ID. */ static unsigned short nr_find_next_circuit(void) { unsigned short id = circuit; unsigned char i, j; struct sock *sk; for (;;) { i = id / 256; j = id % 256; if (i != 0 && j != 0) { if ((sk=nr_find_socket(i, j)) == NULL) break; sock_put(sk); } id++; } return id; } /* * Deferred destroy. */ void nr_destroy_socket(struct sock *); /* * Handler for deferred kills. */ static void nr_destroy_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); bh_lock_sock(sk); sock_hold(sk); nr_destroy_socket(sk); bh_unlock_sock(sk); sock_put(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupt users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. */ void nr_destroy_socket(struct sock *sk) { struct sk_buff *skb; nr_remove_socket(sk); nr_stop_heartbeat(sk); nr_stop_t1timer(sk); nr_stop_t2timer(sk); nr_stop_t4timer(sk); nr_stop_idletimer(sk); nr_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* Queue the unaccepted socket for death */ sock_set_flag(skb->sk, SOCK_DEAD); nr_start_heartbeat(skb->sk); nr_sk(skb->sk)->state = NR_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.function = nr_destroy_timer; sk->sk_timer.expires = jiffies + 2 * HZ; add_timer(&sk->sk_timer); } else sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * NET/ROM socket object. */ static int nr_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); unsigned int opt; if (level != SOL_NETROM) return -ENOPROTOOPT; if (optlen < sizeof(unsigned int)) return -EINVAL; if (copy_from_sockptr(&opt, optval, sizeof(opt))) return -EFAULT; switch (optname) { case NETROM_T1: if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; nr->t1 = opt * HZ; return 0; case NETROM_T2: if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; nr->t2 = opt * HZ; return 0; case NETROM_N2: if (opt < 1 || opt > 31) return -EINVAL; nr->n2 = opt; return 0; case NETROM_T4: if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; nr->t4 = opt * HZ; return 0; case NETROM_IDLE: if (opt > UINT_MAX / (60 * HZ)) return -EINVAL; nr->idle = opt * 60 * HZ; return 0; default: return -ENOPROTOOPT; } } static int nr_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); int val = 0; int len; if (level != SOL_NETROM) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETROM_T1: val = nr->t1 / HZ; break; case NETROM_T2: val = nr->t2 / HZ; break; case NETROM_N2: val = nr->n2; break; case NETROM_T4: val = nr->t4 / HZ; break; case NETROM_IDLE: val = nr->idle / (60 * HZ); break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, len, sizeof(int)); if (put_user(len, optlen)) return -EFAULT; return copy_to_user(optval, &val, len) ? -EFAULT : 0; } static int nr_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { release_sock(sk); return -EINVAL; } if (sk->sk_state != TCP_LISTEN) { memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; release_sock(sk); return 0; } release_sock(sk); return -EOPNOTSUPP; } static struct proto nr_proto = { .name = "NETROM", .owner = THIS_MODULE, .obj_size = sizeof(struct nr_sock), }; static int nr_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct nr_sock *nr; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern); if (sk == NULL) return -ENOMEM; nr = nr_sk(sk); sock_init_data(sock, sk); sock->ops = &nr_proto_ops; sk->sk_protocol = protocol; skb_queue_head_init(&nr->ack_queue); skb_queue_head_init(&nr->reseq_queue); skb_queue_head_init(&nr->frag_queue); nr_init_timers(sk); nr->t1 = msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_timeout)); nr->t2 = msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_acknowledge_delay)); nr->n2 = msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_maximum_tries)); nr->t4 = msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_busy_delay)); nr->idle = msecs_to_jiffies(READ_ONCE(sysctl_netrom_transport_no_activity_timeout)); nr->window = READ_ONCE(sysctl_netrom_transport_requested_window_size); nr->bpqext = 1; nr->state = NR_STATE_0; return 0; } static struct sock *nr_make_new(struct sock *osk) { struct sock *sk; struct nr_sock *nr, *onr; if (osk->sk_type != SOCK_SEQPACKET) return NULL; sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; nr = nr_sk(sk); sock_init_data(NULL, sk); sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sock_copy_flags(sk, osk); skb_queue_head_init(&nr->ack_queue); skb_queue_head_init(&nr->reseq_queue); skb_queue_head_init(&nr->frag_queue); nr_init_timers(sk); onr = nr_sk(osk); nr->t1 = onr->t1; nr->t2 = onr->t2; nr->n2 = onr->n2; nr->t4 = onr->t4; nr->idle = onr->idle; nr->window = onr->window; nr->device = onr->device; nr->bpqext = onr->bpqext; return sk; } static int nr_release(struct socket *sock) { struct sock *sk = sock->sk; struct nr_sock *nr; if (sk == NULL) return 0; sock_hold(sk); sock_orphan(sk); lock_sock(sk); nr = nr_sk(sk); switch (nr->state) { case NR_STATE_0: case NR_STATE_1: case NR_STATE_2: nr_disconnect(sk, 0); nr_destroy_socket(sk); break; case NR_STATE_3: nr_clear_queues(sk); nr->n2count = 0; nr_write_internal(sk, NR_DISCREQ); nr_start_t1timer(sk); nr_stop_t2timer(sk); nr_stop_t4timer(sk); nr_stop_idletimer(sk); nr->state = NR_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DESTROY); break; default: break; } sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; struct net_device *dev; ax25_uid_assoc *user; ax25_address *source; lock_sock(sk); if (!sock_flag(sk, SOCK_ZAPPED)) { release_sock(sk); return -EINVAL; } if (addr_len < sizeof(struct sockaddr_ax25) || addr_len > sizeof(struct full_sockaddr_ax25)) { release_sock(sk); return -EINVAL; } if (addr_len < (addr->fsa_ax25.sax25_ndigis * sizeof(ax25_address) + sizeof(struct sockaddr_ax25))) { release_sock(sk); return -EINVAL; } if (addr->fsa_ax25.sax25_family != AF_NETROM) { release_sock(sk); return -EINVAL; } if ((dev = nr_dev_get(&addr->fsa_ax25.sax25_call)) == NULL) { release_sock(sk); return -EADDRNOTAVAIL; } /* * Only the super user can set an arbitrary user callsign. */ if (addr->fsa_ax25.sax25_ndigis == 1) { if (!capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); release_sock(sk); return -EPERM; } nr->user_addr = addr->fsa_digipeater[0]; nr->source_addr = addr->fsa_ax25.sax25_call; } else { source = &addr->fsa_ax25.sax25_call; user = ax25_findbyuid(current_euid()); if (user) { nr->user_addr = user->call; ax25_uid_put(user); } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { release_sock(sk); dev_put(dev); return -EPERM; } nr->user_addr = *source; } nr->source_addr = *source; } nr->device = dev; nr_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); dev_put(dev); release_sock(sk); return 0; } static int nr_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr; const ax25_address *source = NULL; ax25_uid_assoc *user; struct net_device *dev; int err = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; goto out_release; /* Connect completed during a ERESTARTSYS event */ } if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; err = -ECONNREFUSED; goto out_release; } if (sk->sk_state == TCP_ESTABLISHED) { err = -EISCONN; /* No reconnect on a seqpacket socket */ goto out_release; } if (sock->state == SS_CONNECTING) { err = -EALREADY; goto out_release; } sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) { err = -EINVAL; goto out_release; } if (addr->sax25_family != AF_NETROM) { err = -EINVAL; goto out_release; } if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ sock_reset_flag(sk, SOCK_ZAPPED); if ((dev = nr_dev_first()) == NULL) { err = -ENETUNREACH; goto out_release; } source = (const ax25_address *)dev->dev_addr; user = ax25_findbyuid(current_euid()); if (user) { nr->user_addr = user->call; ax25_uid_put(user); } else { if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) { dev_put(dev); err = -EPERM; goto out_release; } nr->user_addr = *source; } nr->source_addr = *source; nr->device = dev; dev_put(dev); nr_insert_socket(sk); /* Finish the bind */ } nr->dest_addr = addr->sax25_call; release_sock(sk); circuit = nr_find_next_circuit(); lock_sock(sk); nr->my_index = circuit / 256; nr->my_id = circuit % 256; circuit++; /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; nr_establish_data_link(sk); nr->state = NR_STATE_1; nr_start_heartbeat(sk); /* Now the loop */ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { err = -EINPROGRESS; goto out_release; } /* * A Connect Ack with Choke or timeout or failed routing will go to * closed. */ if (sk->sk_state == TCP_SYN_SENT) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk->sk_state != TCP_SYN_SENT) break; if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; } if (sk->sk_state != TCP_ESTABLISHED) { sock->state = SS_UNCONNECTED; err = sock_error(sk); /* Always set at this point */ goto out_release; } sock->state = SS_CONNECTED; out_release: release_sock(sk); return err; } static int nr_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sk_buff *skb; struct sock *newsk; DEFINE_WAIT(wait); struct sock *sk; int err = 0; if ((sk = sock->sk) == NULL) return -EINVAL; lock_sock(sk); if (sk->sk_type != SOCK_SEQPACKET) { err = -EOPNOTSUPP; goto out_release; } if (sk->sk_state != TCP_LISTEN) { err = -EINVAL; goto out_release; } /* * The write queue this time is holding sockets ready to use * hooked into the SABM we saved */ for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); skb = skb_dequeue(&sk->sk_receive_queue); if (skb) break; if (arg->flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ kfree_skb(skb); sk_acceptq_removed(sk); out_release: release_sock(sk); return err; } static int nr_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr; struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); int uaddr_len; memset(&sax->fsa_ax25, 0, sizeof(struct sockaddr_ax25)); lock_sock(sk); if (peer != 0) { if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); return -ENOTCONN; } sax->fsa_ax25.sax25_family = AF_NETROM; sax->fsa_ax25.sax25_ndigis = 1; sax->fsa_ax25.sax25_call = nr->user_addr; memset(sax->fsa_digipeater, 0, sizeof(sax->fsa_digipeater)); sax->fsa_digipeater[0] = nr->dest_addr; uaddr_len = sizeof(struct full_sockaddr_ax25); } else { sax->fsa_ax25.sax25_family = AF_NETROM; sax->fsa_ax25.sax25_ndigis = 0; sax->fsa_ax25.sax25_call = nr->source_addr; uaddr_len = sizeof(struct sockaddr_ax25); } release_sock(sk); return uaddr_len; } int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) { struct sock *sk; struct sock *make; struct nr_sock *nr_make; ax25_address *src, *dest, *user; unsigned short circuit_index, circuit_id; unsigned short peer_circuit_index, peer_circuit_id; unsigned short frametype, flags, window, timeout; int ret; skb_orphan(skb); /* * skb->data points to the netrom frame start */ src = (ax25_address *)(skb->data + 0); dest = (ax25_address *)(skb->data + 7); circuit_index = skb->data[15]; circuit_id = skb->data[16]; peer_circuit_index = skb->data[17]; peer_circuit_id = skb->data[18]; frametype = skb->data[19] & 0x0F; flags = skb->data[19] & 0xF0; /* * Check for an incoming IP over NET/ROM frame. */ if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); skb_reset_transport_header(skb); return nr_rx_ip(skb, dev); } /* * Find an existing socket connection, based on circuit ID, if it's * a Connect Request base it on their circuit ID. * * Circuit ID 0/0 is not valid but it could still be a "reset" for a * circuit that no longer exists at the other end ... */ sk = NULL; if (circuit_index == 0 && circuit_id == 0) { if (frametype == NR_CONNACK && flags == NR_CHOKE_FLAG) sk = nr_find_peer(peer_circuit_index, peer_circuit_id, src); } else { if (frametype == NR_CONNREQ) sk = nr_find_peer(circuit_index, circuit_id, src); else sk = nr_find_socket(circuit_index, circuit_id); } if (sk != NULL) { bh_lock_sock(sk); skb_reset_transport_header(skb); if (frametype == NR_CONNACK && skb->len == 22) nr_sk(sk)->bpqext = 1; else nr_sk(sk)->bpqext = 0; ret = nr_process_rx_frame(sk, skb); bh_unlock_sock(sk); sock_put(sk); return ret; } /* * Now it should be a CONNREQ. */ if (frametype != NR_CONNREQ) { /* * Here it would be nice to be able to send a reset but * NET/ROM doesn't have one. We've tried to extend the protocol * by sending NR_CONNACK | NR_CHOKE_FLAGS replies but that * apparently kills BPQ boxes... :-( * So now we try to follow the established behaviour of * G8PZT's Xrouter which is sending packets with command type 7 * as an extension of the protocol. */ if (READ_ONCE(sysctl_netrom_reset_circuit) && (frametype != NR_RESET || flags != 0)) nr_transmit_reset(skb, 1); return 0; } sk = nr_find_listener(dest); user = (ax25_address *)(skb->data + 21); if (sk == NULL || sk_acceptq_is_full(sk) || (make = nr_make_new(sk)) == NULL) { nr_transmit_refusal(skb, 0); if (sk) sock_put(sk); return 0; } bh_lock_sock(sk); window = skb->data[20]; sock_hold(make); skb->sk = make; skb->destructor = sock_efree; make->sk_state = TCP_ESTABLISHED; /* Fill in his circuit details */ nr_make = nr_sk(make); nr_make->source_addr = *dest; nr_make->dest_addr = *src; nr_make->user_addr = *user; nr_make->your_index = circuit_index; nr_make->your_id = circuit_id; bh_unlock_sock(sk); circuit = nr_find_next_circuit(); bh_lock_sock(sk); nr_make->my_index = circuit / 256; nr_make->my_id = circuit % 256; circuit++; /* Window negotiation */ if (window < nr_make->window) nr_make->window = window; /* L4 timeout negotiation */ if (skb->len == 37) { timeout = skb->data[36] * 256 + skb->data[35]; if (timeout * HZ < nr_make->t1) nr_make->t1 = timeout * HZ; nr_make->bpqext = 1; } else { nr_make->bpqext = 0; } nr_write_internal(make, NR_CONNACK); nr_make->condition = 0x00; nr_make->vs = 0; nr_make->va = 0; nr_make->vr = 0; nr_make->vl = 0; nr_make->state = NR_STATE_3; sk_acceptq_added(sk); skb_queue_head(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); bh_unlock_sock(sk); sock_put(sk); nr_insert_socket(make); nr_start_heartbeat(make); nr_start_idletimer(make); return 1; } static int nr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ax25 *, usax, msg->msg_name); int err; struct sockaddr_ax25 sax; struct sk_buff *skb; unsigned char *asmptr; int size; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) return -EINVAL; lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { err = -EADDRNOTAVAIL; goto out; } if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); err = -EPIPE; goto out; } if (nr->device == NULL) { err = -ENETUNREACH; goto out; } if (usax) { if (msg->msg_namelen < sizeof(sax)) { err = -EINVAL; goto out; } sax = *usax; if (ax25cmp(&nr->dest_addr, &sax.sax25_call) != 0) { err = -EISCONN; goto out; } if (sax.sax25_family != AF_NETROM) { err = -EINVAL; goto out; } } else { if (sk->sk_state != TCP_ESTABLISHED) { err = -ENOTCONN; goto out; } sax.sax25_family = AF_NETROM; sax.sax25_call = nr->dest_addr; } /* Build a packet - the conventional user limit is 236 bytes. We can do ludicrously large NetROM frames but must not overflow */ if (len > 65536) { err = -EMSGSIZE; goto out; } size = len + NR_NETWORK_LEN + NR_TRANSPORT_LEN; if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) goto out; skb_reserve(skb, size - len); skb_reset_transport_header(skb); /* * Push down the NET/ROM header */ asmptr = skb_push(skb, NR_TRANSPORT_LEN); /* Build a NET/ROM Transport header */ *asmptr++ = nr->your_index; *asmptr++ = nr->your_id; *asmptr++ = 0; /* To be filled in later */ *asmptr++ = 0; /* Ditto */ *asmptr++ = NR_INFO; /* * Put the data on the end */ skb_put(skb, len); /* User data follows immediately after the NET/ROM transport header */ if (memcpy_from_msg(skb_transport_header(skb), msg, len)) { kfree_skb(skb); err = -EFAULT; goto out; } if (sk->sk_state != TCP_ESTABLISHED) { kfree_skb(skb); err = -ENOTCONN; goto out; } nr_output(sk, skb); /* Shove it onto the queue */ err = len; out: release_sock(sk); return err; } static int nr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ax25 *, sax, msg->msg_name); size_t copied; struct sk_buff *skb; int er; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ lock_sock(sk); if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); return -ENOTCONN; } /* Now we can treat all alike */ skb = skb_recv_datagram(sk, flags, &er); if (!skb) { release_sock(sk); return er; } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } er = skb_copy_datagram_msg(skb, 0, msg, copied); if (er < 0) { skb_free_datagram(sk, skb); release_sock(sk); return er; } if (sax != NULL) { memset(sax, 0, sizeof(*sax)); sax->sax25_family = AF_NETROM; skb_copy_from_linear_data_offset(skb, 7, sax->sax25_call.ax25_call, AX25_ADDR_LEN); msg->msg_namelen = sizeof(*sax); } skb_free_datagram(sk, skb); release_sock(sk); return copied; } static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; void __user *argp = (void __user *)arg; switch (cmd) { case TIOCOUTQ: { long amount; lock_sock(sk); amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; release_sock(sk); return put_user(amount, (int __user *)argp); } case TIOCINQ: { struct sk_buff *skb; long amount = 0L; lock_sock(sk); /* These two are safe on a single CPU system as only user tasks fiddle here */ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; release_sock(sk); return put_user(amount, (int __user *)argp); } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: return -EINVAL; case SIOCADDRT: case SIOCDELRT: case SIOCNRDECOBS: if (!capable(CAP_NET_ADMIN)) return -EPERM; return nr_rt_ioctl(cmd, argp); default: return -ENOIOCTLCMD; } return 0; } #ifdef CONFIG_PROC_FS static void *nr_info_start(struct seq_file *seq, loff_t *pos) __acquires(&nr_list_lock) { spin_lock_bh(&nr_list_lock); return seq_hlist_start_head(&nr_list, *pos); } static void *nr_info_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_hlist_next(v, &nr_list, pos); } static void nr_info_stop(struct seq_file *seq, void *v) __releases(&nr_list_lock) { spin_unlock_bh(&nr_list_lock); } static int nr_info_show(struct seq_file *seq, void *v) { struct sock *s = sk_entry(v); struct net_device *dev; struct nr_sock *nr; const char *devname; char buf[11]; if (v == SEQ_START_TOKEN) seq_puts(seq, "user_addr dest_node src_node dev my your st vs vr va t1 t2 t4 idle n2 wnd Snd-Q Rcv-Q inode\n"); else { bh_lock_sock(s); nr = nr_sk(s); if ((dev = nr->device) == NULL) devname = "???"; else devname = dev->name; seq_printf(seq, "%-9s ", ax2asc(buf, &nr->user_addr)); seq_printf(seq, "%-9s ", ax2asc(buf, &nr->dest_addr)); seq_printf(seq, "%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n", ax2asc(buf, &nr->source_addr), devname, nr->my_index, nr->my_id, nr->your_index, nr->your_id, nr->state, nr->vs, nr->vr, nr->va, ax25_display_timer(&nr->t1timer) / HZ, nr->t1 / HZ, ax25_display_timer(&nr->t2timer) / HZ, nr->t2 / HZ, ax25_display_timer(&nr->t4timer) / HZ, nr->t4 / HZ, ax25_display_timer(&nr->idletimer) / (60 * HZ), nr->idle / (60 * HZ), nr->n2count, nr->n2, nr->window, sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); bh_unlock_sock(s); } return 0; } static const struct seq_operations nr_info_seqops = { .start = nr_info_start, .next = nr_info_next, .stop = nr_info_stop, .show = nr_info_show, }; #endif /* CONFIG_PROC_FS */ static const struct net_proto_family nr_family_ops = { .family = PF_NETROM, .create = nr_create, .owner = THIS_MODULE, }; static const struct proto_ops nr_proto_ops = { .family = PF_NETROM, .owner = THIS_MODULE, .release = nr_release, .bind = nr_bind, .connect = nr_connect, .socketpair = sock_no_socketpair, .accept = nr_accept, .getname = nr_getname, .poll = datagram_poll, .ioctl = nr_ioctl, .gettstamp = sock_gettstamp, .listen = nr_listen, .shutdown = sock_no_shutdown, .setsockopt = nr_setsockopt, .getsockopt = nr_getsockopt, .sendmsg = nr_sendmsg, .recvmsg = nr_recvmsg, .mmap = sock_no_mmap, }; static struct notifier_block nr_dev_notifier = { .notifier_call = nr_device_event, }; static struct net_device **dev_nr; static struct ax25_protocol nr_pid = { .pid = AX25_P_NETROM, .func = nr_route_frame }; static struct ax25_linkfail nr_linkfail_notifier = { .func = nr_link_failed, }; static int __init nr_proto_init(void) { int i; int rc = proto_register(&nr_proto, 0); if (rc) return rc; if (nr_ndevs > 0x7fffffff/sizeof(struct net_device *)) { pr_err("NET/ROM: %s - nr_ndevs parameter too large\n", __func__); rc = -EINVAL; goto unregister_proto; } dev_nr = kcalloc(nr_ndevs, sizeof(struct net_device *), GFP_KERNEL); if (!dev_nr) { pr_err("NET/ROM: %s - unable to allocate device array\n", __func__); rc = -ENOMEM; goto unregister_proto; } for (i = 0; i < nr_ndevs; i++) { char name[IFNAMSIZ]; struct net_device *dev; sprintf(name, "nr%d", i); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, nr_setup); if (!dev) { rc = -ENOMEM; goto fail; } dev->base_addr = i; rc = register_netdev(dev); if (rc) { free_netdev(dev); goto fail; } nr_set_lockdep_key(dev); dev_nr[i] = dev; } rc = sock_register(&nr_family_ops); if (rc) goto fail; rc = register_netdevice_notifier(&nr_dev_notifier); if (rc) goto out_sock; ax25_register_pid(&nr_pid); ax25_linkfail_register(&nr_linkfail_notifier); #ifdef CONFIG_SYSCTL rc = nr_register_sysctl(); if (rc) goto out_sysctl; #endif nr_loopback_init(); rc = -ENOMEM; if (!proc_create_seq("nr", 0444, init_net.proc_net, &nr_info_seqops)) goto proc_remove1; if (!proc_create_seq("nr_neigh", 0444, init_net.proc_net, &nr_neigh_seqops)) goto proc_remove2; if (!proc_create_seq("nr_nodes", 0444, init_net.proc_net, &nr_node_seqops)) goto proc_remove3; return 0; proc_remove3: remove_proc_entry("nr_neigh", init_net.proc_net); proc_remove2: remove_proc_entry("nr", init_net.proc_net); proc_remove1: nr_loopback_clear(); nr_rt_free(); #ifdef CONFIG_SYSCTL nr_unregister_sysctl(); out_sysctl: #endif ax25_linkfail_release(&nr_linkfail_notifier); ax25_protocol_release(AX25_P_NETROM); unregister_netdevice_notifier(&nr_dev_notifier); out_sock: sock_unregister(PF_NETROM); fail: while (--i >= 0) { unregister_netdev(dev_nr[i]); free_netdev(dev_nr[i]); } kfree(dev_nr); unregister_proto: proto_unregister(&nr_proto); return rc; } module_init(nr_proto_init); module_param(nr_ndevs, int, 0); MODULE_PARM_DESC(nr_ndevs, "number of NET/ROM devices"); MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The amateur radio NET/ROM network and transport layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_NETROM); static void __exit nr_exit(void) { int i; remove_proc_entry("nr", init_net.proc_net); remove_proc_entry("nr_neigh", init_net.proc_net); remove_proc_entry("nr_nodes", init_net.proc_net); nr_loopback_clear(); nr_rt_free(); #ifdef CONFIG_SYSCTL nr_unregister_sysctl(); #endif ax25_linkfail_release(&nr_linkfail_notifier); ax25_protocol_release(AX25_P_NETROM); unregister_netdevice_notifier(&nr_dev_notifier); sock_unregister(PF_NETROM); for (i = 0; i < nr_ndevs; i++) { struct net_device *dev = dev_nr[i]; if (dev) { unregister_netdev(dev); free_netdev(dev); } } kfree(dev_nr); proto_unregister(&nr_proto); } module_exit(nr_exit);
29 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/if_team.h - Network team device driver header * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #ifndef _LINUX_IF_TEAM_H_ #define _LINUX_IF_TEAM_H_ #include <linux/netpoll.h> #include <net/sch_generic.h> #include <linux/types.h> #include <uapi/linux/if_team.h> struct team_pcpu_stats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t rx_multicast; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_dropped; u32 tx_dropped; u32 rx_nohandler; }; struct team; struct team_port { struct net_device *dev; struct hlist_node hlist; /* node in enabled ports hash list */ struct list_head list; /* node in ordinary list */ struct team *team; int index; /* index of enabled port. If disabled, it's set to -1 */ bool linkup; /* either state.linkup or user.linkup */ struct { bool linkup; u32 speed; u8 duplex; } state; /* Values set by userspace */ struct { bool linkup; bool linkup_enabled; } user; /* Custom gennetlink interface related flags */ bool changed; bool removed; /* * A place for storing original values of the device before it * become a port. */ struct { unsigned char dev_addr[MAX_ADDR_LEN]; unsigned int mtu; } orig; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif s32 priority; /* lower number ~ higher priority */ u16 queue_id; struct list_head qom_list; /* node in queue override mapping list */ struct rcu_head rcu; long mode_priv[]; }; static inline struct team_port *team_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static inline bool team_port_enabled(struct team_port *port) { return port->index != -1; } static inline bool team_port_txable(struct team_port *port) { return port->linkup && team_port_enabled(port); } static inline bool team_port_dev_txable(const struct net_device *port_dev) { struct team_port *port; bool txable; rcu_read_lock(); port = team_port_get_rcu(port_dev); txable = port ? team_port_txable(port) : false; rcu_read_unlock(); return txable; } #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { } #endif struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); rx_handler_result_t (*receive)(struct team *team, struct team_port *port, struct sk_buff *skb); bool (*transmit)(struct team *team, struct sk_buff *skb); int (*port_enter)(struct team *team, struct team_port *port); void (*port_leave)(struct team *team, struct team_port *port); void (*port_change_dev_addr)(struct team *team, struct team_port *port); void (*port_enabled)(struct team *team, struct team_port *port); void (*port_disabled)(struct team *team, struct team_port *port); }; extern int team_modeop_port_enter(struct team *team, struct team_port *port); extern void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port); enum team_option_type { TEAM_OPTION_TYPE_U32, TEAM_OPTION_TYPE_STRING, TEAM_OPTION_TYPE_BINARY, TEAM_OPTION_TYPE_BOOL, TEAM_OPTION_TYPE_S32, }; struct team_option_inst_info { u32 array_index; struct team_port *port; /* != NULL if per-port */ }; struct team_gsetter_ctx { union { u32 u32_val; const char *str_val; struct { const void *ptr; u32 len; } bin_val; bool bool_val; s32 s32_val; } data; struct team_option_inst_info *info; }; struct team_option { struct list_head list; const char *name; bool per_port; unsigned int array_size; /* != 0 means the option is array */ enum team_option_type type; void (*init)(struct team *team, struct team_option_inst_info *info); void (*getter)(struct team *team, struct team_gsetter_ctx *ctx); int (*setter)(struct team *team, struct team_gsetter_ctx *ctx); }; extern void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info); extern void team_options_change_check(struct team *team); struct team_mode { const char *kind; struct module *owner; size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 #define TEAM_PORT_HASHENTRIES (1 << TEAM_PORT_HASHBITS) #define TEAM_MODE_PRIV_LONGS 4 #define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS) struct team { struct net_device *dev; /* associated netdevice */ struct team_pcpu_stats __percpu *pcpu_stats; const struct header_ops *header_ops_cache; struct mutex lock; /* used for overall locking, e.g. port lists write */ /* * List of enabled ports and their count */ int en_port_count; struct hlist_head en_port_hlist[TEAM_PORT_HASHENTRIES]; struct list_head port_list; /* list of all ports */ struct list_head option_list; struct list_head option_inst_list; /* list of option instances */ const struct team_mode *mode; struct team_mode_ops ops; bool user_carrier_enabled; bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ bool port_mtu_change_allowed; bool notifier_ctx; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } notify_peers; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } mcast_rejoin; struct lock_class_key team_lock_key; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); skb->dev = port->dev; if (unlikely(netpoll_tx_running(team->dev))) { team_netpoll_send_skb(port, skb); return 0; } return dev_queue_xmit(skb); } static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { return &team->en_port_hlist[port_index & (TEAM_PORT_HASHENTRIES - 1)]; } static inline struct team_port *team_get_port_by_index(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline int team_num_to_port_index(struct team *team, unsigned int num) { int en_port_count = READ_ONCE(team->en_port_count); if (unlikely(!en_port_count)) return 0; return num % en_port_count; } static inline struct team_port *team_get_port_by_index_rcu(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry_rcu(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline struct team_port * team_get_first_port_txable_rcu(struct team *team, struct team_port *port) { struct team_port *cur; if (likely(team_port_txable(port))) return port; cur = port; list_for_each_entry_continue_rcu(cur, &team->port_list, list) if (team_port_txable(cur)) return cur; list_for_each_entry_rcu(cur, &team->port_list, list) { if (cur == port) break; if (team_port_txable(cur)) return cur; } return NULL; } extern int team_options_register(struct team *team, const struct team_option *option, size_t option_count); extern void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count); extern int team_mode_register(const struct team_mode *mode); extern void team_mode_unregister(const struct team_mode *mode); #define TEAM_DEFAULT_NUM_TX_QUEUES 16 #define TEAM_DEFAULT_NUM_RX_QUEUES 16 #define MODULE_ALIAS_TEAM_MODE(kind) MODULE_ALIAS("team-mode-" kind) #endif /* _LINUX_IF_TEAM_H_ */
15 15 15 15 14 15 15 15 15 15 15 15 15 15 14 15 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 // SPDX-License-Identifier: GPL-2.0 /* * ring buffer based function tracer * * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com> * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> * * Originally taken from the RT patch by: * Arnaldo Carvalho de Melo <acme@redhat.com> * * Based on code from the latency_tracer, that is: * Copyright (C) 2004-2006 Ingo Molnar * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/ring_buffer.h> #include <linux/utsname.h> #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/irqflags.h> #include <linux/debugfs.h> #include <linux/tracefs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> #include <linux/linkage.h> #include <linux/uaccess.h> #include <linux/cleanup.h> #include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/splice.h> #include <linux/kdebug.h> #include <linux/string.h> #include <linux/mount.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/panic_notifier.h> #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> #include <linux/trace.h> #include <linux/sched/clock.h> #include <linux/sched/rt.h> #include <linux/fsnotify.h> #include <linux/irq_work.h> #include <linux/workqueue.h> #include <asm/setup.h> /* COMMAND_LINE_SIZE */ #include "trace.h" #include "trace_output.h" #ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the * entries inserted during the selftest although some concurrent * insertions into the ring-buffer such as trace_printk could occurred * at the same time, giving false positive or negative results. */ static bool __read_mostly tracing_selftest_running; /* * If boot-time tracing including tracers/events via kernel cmdline * is running, we do not want to run SELFTEST. */ bool __read_mostly tracing_selftest_disabled; void __init disable_tracing_selftest(const char *reason) { if (!tracing_selftest_disabled) { tracing_selftest_disabled = true; pr_info("Ftrace startup test is disabled due to %s\n", reason); } } #else #define tracing_selftest_running 0 #define tracing_selftest_disabled 0 #endif /* Pipe tracepoints to printk */ static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } }; static int dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return 0; } /* * To prevent the comm cache from being overwritten when no * tracing is active, only save the comm when a trace event * occurred. */ DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). * It is initialized to 1 but will turn to zero if the initialization * of the tracer is successful. But that is the only place that sets * this back to zero. */ static int tracing_disabled = 1; cpumask_var_t __read_mostly tracing_buffer_mask; /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * * If there is an oops (or kernel panic) and the ftrace_dump_on_oops * is set, then ftrace_dump is called. This will output the contents * of the ftrace buffers to the console. This is very useful for * capturing traces that lead to crashes and outputing it to a * serial console. * * It is default off, but you can enable it with either specifying * "ftrace_dump_on_oops" in the kernel command line, or setting * /proc/sys/kernel/ftrace_dump_on_oops * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops * Set instance name if you want to dump the specific trace instance * Multiple instance dump is also supported, and instances are seperated * by commas. */ /* Set to string format zero to disable by default */ char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; #ifdef CONFIG_TRACE_EVAL_MAP_FILE /* Map of enums to their values, for "eval_map" file */ struct trace_eval_map_head { struct module *mod; unsigned long length; }; union trace_eval_map_item; struct trace_eval_map_tail { /* * "end" is first and points to NULL as it must be different * than "mod" or "eval_string" */ union trace_eval_map_item *next; const char *end; /* points to NULL */ }; static DEFINE_MUTEX(trace_eval_mutex); /* * The trace_eval_maps are saved in an array with two extra elements, * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a * pointer to the next array of saved eval_map items. */ union trace_eval_map_item { struct trace_eval_map map; struct trace_eval_map_head head; struct trace_eval_map_tail tail; }; static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ int tracing_set_tracer(struct trace_array *tr, const char *buf); static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx); static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; static bool allocate_snapshot; static bool snapshot_at_boot; static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; static int boot_instance_index; static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; static int boot_snapshot_index; static int __init set_cmdline_ftrace(char *str) { strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ trace_set_ring_buffer_expanded(NULL); return 1; } __setup("ftrace=", set_cmdline_ftrace); int ftrace_dump_on_oops_enabled(void) { if (!strcmp("0", ftrace_dump_on_oops)) return 0; else return 1; } static int __init set_ftrace_dump_on_oops(char *str) { if (!*str) { strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); return 1; } if (*str == ',') { strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1); return 1; } if (*str++ == '=') { strscpy(ftrace_dump_on_oops, str, MAX_TRACER_SIZE); return 1; } return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); static int __init stop_trace_on_warning(char *str) { if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) __disable_trace_on_warning = 1; return 1; } __setup("traceoff_on_warning", stop_trace_on_warning); static int __init boot_alloc_snapshot(char *str) { char *slot = boot_snapshot_info + boot_snapshot_index; int left = sizeof(boot_snapshot_info) - boot_snapshot_index; int ret; if (str[0] == '=') { str++; if (strlen(str) >= left) return -1; ret = snprintf(slot, left, "%s\t", str); boot_snapshot_index += ret; } else { allocate_snapshot = true; /* We also need the main ring buffer expanded */ trace_set_ring_buffer_expanded(NULL); } return 1; } __setup("alloc_snapshot", boot_alloc_snapshot); static int __init boot_snapshot(char *str) { snapshot_at_boot = true; boot_alloc_snapshot(str); return 1; } __setup("ftrace_boot_snapshot", boot_snapshot); static int __init boot_instance(char *str) { char *slot = boot_instance_info + boot_instance_index; int left = sizeof(boot_instance_info) - boot_instance_index; int ret; if (strlen(str) >= left) return -1; ret = snprintf(slot, left, "%s\t", str); boot_instance_index += ret; return 1; } __setup("trace_instance=", boot_instance); static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); return 1; } __setup("trace_options=", set_trace_boot_options); static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; static char *trace_boot_clock __initdata; static int __init set_trace_boot_clock(char *str) { strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; return 1; } __setup("trace_clock=", set_trace_boot_clock); static int __init set_tracepoint_printk(char *str) { /* Ignore the "tp_printk_stop_on_boot" param */ if (*str == '_') return 0; if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) tracepoint_printk = 1; return 1; } __setup("tp_printk", set_tracepoint_printk); static int __init set_tracepoint_printk_stop(char *str) { tracepoint_printk_stop_on_boot = true; return 1; } __setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop); unsigned long long ns2usecs(u64 nsec) { nsec += 500; do_div(nsec, 1000); return nsec; } static void trace_process_export(struct trace_export *export, struct ring_buffer_event *event, int flag) { struct trace_entry *entry; unsigned int size = 0; if (export->flags & flag) { entry = ring_buffer_event_data(event); size = ring_buffer_event_length(event); export->write(export, entry, size); } } static DEFINE_MUTEX(ftrace_export_lock); static struct trace_export __rcu *ftrace_exports_list __read_mostly; static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled); static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled); static inline void ftrace_exports_enable(struct trace_export *export) { if (export->flags & TRACE_EXPORT_FUNCTION) static_branch_inc(&trace_function_exports_enabled); if (export->flags & TRACE_EXPORT_EVENT) static_branch_inc(&trace_event_exports_enabled); if (export->flags & TRACE_EXPORT_MARKER) static_branch_inc(&trace_marker_exports_enabled); } static inline void ftrace_exports_disable(struct trace_export *export) { if (export->flags & TRACE_EXPORT_FUNCTION) static_branch_dec(&trace_function_exports_enabled); if (export->flags & TRACE_EXPORT_EVENT) static_branch_dec(&trace_event_exports_enabled); if (export->flags & TRACE_EXPORT_MARKER) static_branch_dec(&trace_marker_exports_enabled); } static void ftrace_exports(struct ring_buffer_event *event, int flag) { struct trace_export *export; preempt_disable_notrace(); export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event, flag); export = rcu_dereference_raw_check(export->next); } preempt_enable_notrace(); } static inline void add_trace_export(struct trace_export **list, struct trace_export *export) { rcu_assign_pointer(export->next, *list); /* * We are entering export into the list but another * CPU might be walking that list. We need to make sure * the export->next pointer is valid before another CPU sees * the export pointer included into the list. */ rcu_assign_pointer(*list, export); } static inline int rm_trace_export(struct trace_export **list, struct trace_export *export) { struct trace_export **p; for (p = list; *p != NULL; p = &(*p)->next) if (*p == export) break; if (*p != export) return -1; rcu_assign_pointer(*p, (*p)->next); return 0; } static inline void add_ftrace_export(struct trace_export **list, struct trace_export *export) { ftrace_exports_enable(export); add_trace_export(list, export); } static inline int rm_ftrace_export(struct trace_export **list, struct trace_export *export) { int ret; ret = rm_trace_export(list, export); ftrace_exports_disable(export); return ret; } int register_ftrace_export(struct trace_export *export) { if (WARN_ON_ONCE(!export->write)) return -1; mutex_lock(&ftrace_export_lock); add_ftrace_export(&ftrace_exports_list, export); mutex_unlock(&ftrace_export_lock); return 0; } EXPORT_SYMBOL_GPL(register_ftrace_export); int unregister_ftrace_export(struct trace_export *export) { int ret; mutex_lock(&ftrace_export_lock); ret = rm_ftrace_export(&ftrace_exports_list, export); mutex_unlock(&ftrace_export_lock); return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ (FUNCTION_DEFAULT_FLAGS | \ TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK) /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK) /* * The global_trace is the descriptor that holds the top-level tracing * buffers for the live tracing. */ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, }; static struct trace_array *printk_trace = &global_trace; static __always_inline bool printk_binsafe(struct trace_array *tr) { /* * The binary format of traceprintk can cause a crash if used * by a buffer from another boot. Force the use of the * non binary version of trace_printk if the trace_printk * buffer is a boot mapped ring buffer. */ return !(tr->flags & TRACE_ARRAY_FL_BOOT); } static void update_printk_trace(struct trace_array *tr) { if (printk_trace == tr) return; printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK; printk_trace = tr; tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; } void trace_set_ring_buffer_expanded(struct trace_array *tr) { if (!tr) tr = &global_trace; tr->ring_buffer_expanded = true; } LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { tr->ref++; return 0; } } return -ENODEV; } static void __trace_array_put(struct trace_array *this_tr) { WARN_ON(!this_tr->ref); this_tr->ref--; } /** * trace_array_put - Decrement the reference counter for this trace array. * @this_tr : pointer to the trace array * * NOTE: Use this when we no longer need the trace array returned by * trace_array_get_by_name(). This ensures the trace array can be later * destroyed. * */ void trace_array_put(struct trace_array *this_tr) { if (!this_tr) return; mutex_lock(&trace_types_lock); __trace_array_put(this_tr); mutex_unlock(&trace_types_lock); } EXPORT_SYMBOL_GPL(trace_array_put); int tracing_check_open_get_tr(struct trace_array *tr) { int ret; ret = security_locked_down(LOCKDOWN_TRACEFS); if (ret) return ret; if (tracing_disabled) return -ENODEV; if (tr && trace_array_get(tr) < 0) return -ENODEV; return 0; } /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check * @search_pid: The PID to find in @filtered_pids * * Returns true if @search_pid is found in @filtered_pids, and false otherwise. */ bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { return trace_pid_list_is_set(filtered_pids, search_pid); } /** * trace_ignore_this_task - should a task be ignored for tracing * @filtered_pids: The list of pids to check * @filtered_no_pids: The list of pids not to be traced * @task: The task that should be ignored if not filtered * * Checks if @task should be traced or not from @filtered_pids. * Returns true if @task should *NOT* be traced. * Returns false if @task should be traced. */ bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct trace_pid_list *filtered_no_pids, struct task_struct *task) { /* * If filtered_no_pids is not empty, and the task's pid is listed * in filtered_no_pids, then return true. * Otherwise, if filtered_pids is empty, that means we can * trace all tasks. If it has content, then only trace pids * within filtered_pids. */ return (filtered_pids && !trace_find_filtered_pid(filtered_pids, task->pid)) || (filtered_no_pids && trace_find_filtered_pid(filtered_no_pids, task->pid)); } /** * trace_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove * * If adding a task, if @self is defined, the task is only added if @self * is also included in @pid_list. This happens on fork and tasks should * only be added when the parent is listed. If @self is NULL, then the * @task pid will be removed from the list, which would happen on exit * of a task. */ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, struct task_struct *self, struct task_struct *task) { if (!pid_list) return; /* For forks, we only add if the forking task is listed */ if (self) { if (!trace_find_filtered_pid(pid_list, self->pid)) return; } /* "self" is set for forks, and NULL for exits */ if (self) trace_pid_list_set(pid_list, task->pid); else trace_pid_list_clear(pid_list, task->pid); } /** * trace_pid_next - Used for seq_file to get to the next pid of a pid_list * @pid_list: The pid list to show * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) * @pos: The position of the file * * This is used by the seq_file "next" operation to iterate the pids * listed in a trace_pid_list structure. * * Returns the pid+1 as we want to display pid of zero, but NULL would * stop the iteration. */ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) { long pid = (unsigned long)v; unsigned int next; (*pos)++; /* pid already is +1 of the actual previous bit */ if (trace_pid_list_next(pid_list, pid, &next) < 0) return NULL; pid = next; /* Return pid + 1 to allow zero to be represented */ return (void *)(pid + 1); } /** * trace_pid_start - Used for seq_file to start reading pid lists * @pid_list: The pid list to show * @pos: The position of the file * * This is used by seq_file "start" operation to start the iteration * of listing pids. * * Returns the pid+1 as we want to display pid of zero, but NULL would * stop the iteration. */ void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) { unsigned long pid; unsigned int first; loff_t l = 0; if (trace_pid_list_first(pid_list, &first) < 0) return NULL; pid = first; /* Return pid + 1 so that zero can be the exit value */ for (pid++; pid && l < *pos; pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) ; return (void *)pid; } /** * trace_pid_show - show the current pid in seq_file processing * @m: The seq_file structure to write into * @v: A void pointer of the pid (+1) value to display * * Can be directly used by seq_file operations to display the current * pid value. */ int trace_pid_show(struct seq_file *m, void *v) { unsigned long pid = (unsigned long)v - 1; seq_printf(m, "%lu\n", pid); return 0; } /* 128 should be much more than enough */ #define PID_BUF_SIZE 127 int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt) { struct trace_pid_list *pid_list; struct trace_parser parser; unsigned long val; int nr_pids = 0; ssize_t read = 0; ssize_t ret; loff_t pos; pid_t pid; if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) return -ENOMEM; /* * Always recreate a new array. The write is an all or nothing * operation. Always create a new array when adding new pids by * the user. If the operation fails, then the current list is * not modified. */ pid_list = trace_pid_list_alloc(); if (!pid_list) { trace_parser_put(&parser); return -ENOMEM; } if (filtered_pids) { /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { trace_pid_list_set(pid_list, pid); ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } } ret = 0; while (cnt > 0) { pos = 0; ret = trace_get_user(&parser, ubuf, cnt, &pos); if (ret < 0) break; read += ret; ubuf += ret; cnt -= ret; if (!trace_parser_loaded(&parser)) break; ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; pid = (pid_t)val; if (trace_pid_list_set(pid_list, pid) < 0) { ret = -1; break; } nr_pids++; trace_parser_clear(&parser); ret = 0; } trace_parser_put(&parser); if (ret < 0) { trace_pid_list_free(pid_list); return ret; } if (!nr_pids) { /* Cleared the list of pids */ trace_pid_list_free(pid_list); pid_list = NULL; } *new_pid_list = pid_list; return read; } static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) { u64 ts; /* Early boot up does not have a buffer yet */ if (!buf->buffer) return trace_clock_local(); ts = ring_buffer_time_stamp(buf->buffer); ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; } u64 ftrace_now(int cpu) { return buffer_ftrace_now(&global_trace.array_buffer, cpu); } /** * tracing_is_enabled - Show if global_trace has been enabled * * Shows if the global trace has been enabled or not. It uses the * mirror flag "buffer_disabled" to be used in fast paths such as for * the irqsoff tracer. But it may be inaccurate due to races. If you * need to know the accurate state, use tracing_is_on() which is a little * slower, but accurate. */ int tracing_is_enabled(void) { /* * For quick access (irqsoff uses this in fast path), just * return the mirror variable of the state of the ring buffer. * It's a little racy, but we don't really care. */ smp_rmb(); return !global_trace.buffer_disabled; } /* * trace_buf_size is the size in bytes that is allocated * for a buffer. Note, the number of bytes is always rounded * to page size. * * This number is purposely set to a low number of 16384. * If the dump on oops happens, it will be much appreciated * to not have to wait for all that output. Anyway this can be * boot time and run time configurable. */ #define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; /* * trace_types_lock is used to protect the trace_types list. */ DEFINE_MUTEX(trace_types_lock); /* * serialize the access of the ring buffer * * ring buffer serializes readers, but it is low level protection. * The validity of the events (which returns by ring_buffer_peek() ..etc) * are not protected by ring buffer. * * The content of events may become garbage if we allow other process consumes * these events concurrently: * A) the page of the consumed events may become a normal page * (not reader page) in ring buffer, and this page will be rewritten * by events producer. * B) The page of the consumed events may become a page for splice_read, * and this page will be returned to system. * * These primitives allow multi process access to different cpu ring buffer * concurrently. * * These primitives don't distinguish read-only and read-consume access. * Multi read-only access are also serialized. */ #ifdef CONFIG_SMP static DECLARE_RWSEM(all_cpu_access_lock); static DEFINE_PER_CPU(struct mutex, cpu_access_lock); static inline void trace_access_lock(int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { /* gain it for accessing the whole ring buffer. */ down_write(&all_cpu_access_lock); } else { /* gain it for accessing a cpu ring buffer. */ /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ down_read(&all_cpu_access_lock); /* Secondly block other access to this @cpu ring buffer. */ mutex_lock(&per_cpu(cpu_access_lock, cpu)); } } static inline void trace_access_unlock(int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { up_write(&all_cpu_access_lock); } else { mutex_unlock(&per_cpu(cpu_access_lock, cpu)); up_read(&all_cpu_access_lock); } } static inline void trace_access_lock_init(void) { int cpu; for_each_possible_cpu(cpu) mutex_init(&per_cpu(cpu_access_lock, cpu)); } #else static DEFINE_MUTEX(access_lock); static inline void trace_access_lock(int cpu) { (void)cpu; mutex_lock(&access_lock); } static inline void trace_access_unlock(int cpu) { (void)cpu; mutex_unlock(&access_lock); } static inline void trace_access_lock_init(void) { } #endif #ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs); #else static inline void __ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { } static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned long trace_ctx, int skip, struct pt_regs *regs) { } #endif static __always_inline void trace_event_setup(struct ring_buffer_event *event, int type, unsigned int trace_ctx) { struct trace_entry *ent = ring_buffer_event_data(event); tracing_generic_entry_update(ent, type, trace_ctx); } static __always_inline struct ring_buffer_event * __trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned int trace_ctx) { struct ring_buffer_event *event; event = ring_buffer_lock_reserve(buffer, len); if (event != NULL) trace_event_setup(event, type, trace_ctx); return event; } void tracer_tracing_on(struct trace_array *tr) { if (tr->array_buffer.buffer) ring_buffer_record_on(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to * know if the ring buffer has been disabled, but it can handle * races of where it gets disabled but we still do a record. * As the check is in the fast path of the tracers, it is more * important to be fast than accurate. */ tr->buffer_disabled = 0; /* Make the flag seen by readers */ smp_wmb(); } /** * tracing_on - enable tracing buffers * * This function enables tracing buffers that may have been * disabled with tracing_off. */ void tracing_on(void) { tracer_tracing_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_on); static __always_inline void __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_taskinfo_save, true); /* If this is the temp buffer, we need to commit fully */ if (this_cpu_read(trace_buffered_event) == event) { /* Length is in event->array[0] */ ring_buffer_write(buffer, event->array[0], &event->array[1]); /* Release the temp buffer */ this_cpu_dec(trace_buffered_event_cnt); /* ring_buffer_unlock_commit() enables preemption */ preempt_enable_notrace(); } else ring_buffer_unlock_commit(buffer); } int __trace_array_puts(struct trace_array *tr, unsigned long ip, const char *str, int size) { struct ring_buffer_event *event; struct trace_buffer *buffer; struct print_entry *entry; unsigned int trace_ctx; int alloc; if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running && tr == &global_trace)) return 0; if (unlikely(tracing_disabled)) return 0; alloc = sizeof(*entry) + size + 2; /* possible \n added */ trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); if (!event) { size = 0; goto out; } entry = ring_buffer_event_data(event); entry->ip = ip; memcpy(&entry->buf, str, size); /* Add a newline if necessary */ if (entry->buf[size - 1] != '\n') { entry->buf[size] = '\n'; entry->buf[size + 1] = '\0'; } else entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); out: ring_buffer_nest_end(buffer); return size; } EXPORT_SYMBOL_GPL(__trace_array_puts); /** * __trace_puts - write a constant string into the trace buffer. * @ip: The address of the caller * @str: The constant string to write * @size: The size of the string. */ int __trace_puts(unsigned long ip, const char *str, int size) { return __trace_array_puts(printk_trace, ip, str, size); } EXPORT_SYMBOL_GPL(__trace_puts); /** * __trace_bputs - write the pointer to a constant string into trace buffer * @ip: The address of the caller * @str: The constant string to write to the buffer to */ int __trace_bputs(unsigned long ip, const char *str) { struct trace_array *tr = READ_ONCE(printk_trace); struct ring_buffer_event *event; struct trace_buffer *buffer; struct bputs_entry *entry; unsigned int trace_ctx; int size = sizeof(struct bputs_entry); int ret = 0; if (!printk_binsafe(tr)) return __trace_puts(ip, str, strlen(str)); if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->str = str; __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); ret = 1; out: ring_buffer_nest_end(buffer); return ret; } EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT static void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) { struct tracer *tracer = tr->current_trace; unsigned long flags; if (in_nmi()) { trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); trace_array_puts(tr, "*** snapshot is being ignored ***\n"); return; } if (!tr->allocated_snapshot) { trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); trace_array_puts(tr, "*** stopping trace here! ***\n"); tracer_tracing_off(tr); return; } /* Note, snapshot can not be used when the tracer uses it */ if (tracer->use_max_tr) { trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); return; } if (tr->mapped) { trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); return; } local_irq_save(flags); update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); } void tracing_snapshot_instance(struct trace_array *tr) { tracing_snapshot_instance_cond(tr, NULL); } /** * tracing_snapshot - take a snapshot of the current buffer. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live * trace when some condition is triggered, but continue to trace. * * Note, make sure to allocate the snapshot with either * a tracing_snapshot_alloc(), or by doing it manually * with: echo 1 > /sys/kernel/tracing/snapshot * * If the snapshot buffer is not allocated, it will stop tracing. * Basically making a permanent snapshot. */ void tracing_snapshot(void) { struct trace_array *tr = &global_trace; tracing_snapshot_instance(tr); } EXPORT_SYMBOL_GPL(tracing_snapshot); /** * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. * @tr: The tracing instance to snapshot * @cond_data: The data to be tested conditionally, and possibly saved * * This is the same as tracing_snapshot() except that the snapshot is * conditional - the snapshot will only happen if the * cond_snapshot.update() implementation receiving the cond_data * returns true, which means that the trace array's cond_snapshot * update() operation used the cond_data to determine whether the * snapshot should be taken, and if it was, presumably saved it along * with the snapshot. */ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) { tracing_snapshot_instance_cond(tr, cond_data); } EXPORT_SYMBOL_GPL(tracing_snapshot_cond); /** * tracing_cond_snapshot_data - get the user data associated with a snapshot * @tr: The tracing instance * * When the user enables a conditional snapshot using * tracing_snapshot_cond_enable(), the user-defined cond_data is saved * with the snapshot. This accessor is used to retrieve it. * * Should not be called from cond_snapshot.update(), since it takes * the tr->max_lock lock, which the code calling * cond_snapshot.update() has already done. * * Returns the cond_data associated with the trace array's snapshot. */ void *tracing_cond_snapshot_data(struct trace_array *tr) { void *cond_data = NULL; local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) cond_data = tr->cond_snapshot->cond_data; arch_spin_unlock(&tr->max_lock); local_irq_enable(); return cond_data; } EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, struct array_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct array_buffer *buf, unsigned long val); int tracing_alloc_snapshot_instance(struct trace_array *tr) { int order; int ret; if (!tr->allocated_snapshot) { /* Make the snapshot buffer have the same order as main buffer */ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); if (ret < 0) return ret; /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) return ret; tr->allocated_snapshot = true; } return 0; } static void free_snapshot(struct trace_array *tr) { /* * We don't free the ring buffer. instead, resize it because * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0); ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); tr->allocated_snapshot = false; } static int tracing_arm_snapshot_locked(struct trace_array *tr) { int ret; lockdep_assert_held(&trace_types_lock); spin_lock(&tr->snapshot_trigger_lock); if (tr->snapshot == UINT_MAX || tr->mapped) { spin_unlock(&tr->snapshot_trigger_lock); return -EBUSY; } tr->snapshot++; spin_unlock(&tr->snapshot_trigger_lock); ret = tracing_alloc_snapshot_instance(tr); if (ret) { spin_lock(&tr->snapshot_trigger_lock); tr->snapshot--; spin_unlock(&tr->snapshot_trigger_lock); } return ret; } int tracing_arm_snapshot(struct trace_array *tr) { int ret; mutex_lock(&trace_types_lock); ret = tracing_arm_snapshot_locked(tr); mutex_unlock(&trace_types_lock); return ret; } void tracing_disarm_snapshot(struct trace_array *tr) { spin_lock(&tr->snapshot_trigger_lock); if (!WARN_ON(!tr->snapshot)) tr->snapshot--; spin_unlock(&tr->snapshot_trigger_lock); } /** * tracing_alloc_snapshot - allocate snapshot buffer. * * This only allocates the snapshot buffer if it isn't already * allocated - it doesn't also take a snapshot. * * This is meant to be used in cases where the snapshot buffer needs * to be set up for events that can't sleep but need to be able to * trigger a snapshot. */ int tracing_alloc_snapshot(void) { struct trace_array *tr = &global_trace; int ret; ret = tracing_alloc_snapshot_instance(tr); WARN_ON(ret < 0); return ret; } EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); /** * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. * * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live * trace when some condition is triggered, but continue to trace. */ void tracing_snapshot_alloc(void) { int ret; ret = tracing_alloc_snapshot(); if (ret < 0) return; tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); /** * tracing_snapshot_cond_enable - enable conditional snapshot for an instance * @tr: The tracing instance * @cond_data: User data to associate with the snapshot * @update: Implementation of the cond_snapshot update function * * Check whether the conditional snapshot for the given instance has * already been enabled, or if the current tracer is already using a * snapshot; if so, return -EBUSY, else create a cond_snapshot and * save the cond_data and update function inside. * * Returns 0 if successful, error otherwise. */ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { struct cond_snapshot *cond_snapshot __free(kfree) = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); int ret; if (!cond_snapshot) return -ENOMEM; cond_snapshot->cond_data = cond_data; cond_snapshot->update = update; guard(mutex)(&trace_types_lock); if (tr->current_trace->use_max_tr) return -EBUSY; /* * The cond_snapshot can only change to NULL without the * trace_types_lock. We don't care if we race with it going * to NULL, but we want to make sure that it's not set to * something other than NULL when we get here, which we can * do safely with only holding the trace_types_lock and not * having to take the max_lock. */ if (tr->cond_snapshot) return -EBUSY; ret = tracing_arm_snapshot_locked(tr); if (ret) return ret; local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = no_free_ptr(cond_snapshot); arch_spin_unlock(&tr->max_lock); local_irq_enable(); return 0; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); /** * tracing_snapshot_cond_disable - disable conditional snapshot for an instance * @tr: The tracing instance * * Check whether the conditional snapshot for the given instance is * enabled; if so, free the cond_snapshot associated with it, * otherwise return -EINVAL. * * Returns 0 if successful, error otherwise. */ int tracing_snapshot_cond_disable(struct trace_array *tr) { int ret = 0; local_irq_disable(); arch_spin_lock(&tr->max_lock); if (!tr->cond_snapshot) ret = -EINVAL; else { kfree(tr->cond_snapshot); tr->cond_snapshot = NULL; } arch_spin_unlock(&tr->max_lock); local_irq_enable(); tracing_disarm_snapshot(tr); return ret; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) { WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot_cond); int tracing_alloc_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); return -ENODEV; } EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); void *tracing_cond_snapshot_data(struct trace_array *tr) { return NULL; } EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { return -ENODEV; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); int tracing_snapshot_cond_disable(struct trace_array *tr) { return false; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #define free_snapshot(tr) do { } while (0) #define tracing_arm_snapshot_locked(tr) ({ -EBUSY; }) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) { if (tr->array_buffer.buffer) ring_buffer_record_off(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to * know if the ring buffer has been disabled, but it can handle * races of where it gets disabled but we still do a record. * As the check is in the fast path of the tracers, it is more * important to be fast than accurate. */ tr->buffer_disabled = 1; /* Make the flag seen by readers */ smp_wmb(); } /** * tracing_off - turn off tracing buffers * * This function stops the tracing buffers from recording data. * It does not disable any overhead the tracers themselves may * be causing. This function simply causes all recording to * the ring buffers to fail. */ void tracing_off(void) { tracer_tracing_off(&global_trace); } EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { if (__disable_trace_on_warning) { trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, "Disabling tracing due to warning\n"); tracing_off(); } } /** * tracer_tracing_is_on - show real state of ring buffer enabled * @tr : the trace array to know if ring buffer is enabled * * Shows real state of the ring buffer if it is enabled or not. */ bool tracer_tracing_is_on(struct trace_array *tr) { if (tr->array_buffer.buffer) return ring_buffer_record_is_set_on(tr->array_buffer.buffer); return !tr->buffer_disabled; } /** * tracing_is_on - show state of ring buffers enabled */ int tracing_is_on(void) { return tracer_tracing_is_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_is_on); static int __init set_buf_size(char *str) { unsigned long buf_size; if (!str) return 0; buf_size = memparse(str, &str); /* * nr_entries can not be zero and the startup * tests require some buffer space. Therefore * ensure we have at least 4096 bytes of buffer. */ trace_buf_size = max(4096UL, buf_size); return 1; } __setup("trace_buf_size=", set_buf_size); static int __init set_tracing_thresh(char *str) { unsigned long threshold; int ret; if (!str) return 0; ret = kstrtoul(str, 0, &threshold); if (ret < 0) return 0; tracing_thresh = threshold * 1000; return 1; } __setup("tracing_thresh=", set_tracing_thresh); unsigned long nsecs_to_usecs(unsigned long nsecs) { return nsecs / 1000; } /* * TRACE_FLAGS is defined as a tuple matching bit masks with strings. * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list * of strings in the order that the evals (enum) were defined. */ #undef C #define C(a, b) b /* These must match the bit positions in trace_iterator_flags */ static const char *trace_options[] = { TRACE_FLAGS NULL }; static struct { u64 (*func)(void); const char *name; int in_ns; /* is this clock in nanoseconds? */ } trace_clocks[] = { { trace_clock_local, "local", 1 }, { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, { trace_clock_jiffies, "uptime", 0 }, { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, { ktime_get_boot_fast_ns, "boot", 1 }, { ktime_get_tai_fast_ns, "tai", 1 }, ARCH_TRACE_CLOCKS }; bool trace_clock_in_ns(struct trace_array *tr) { if (trace_clocks[tr->clock_id].in_ns) return true; return false; } /* * trace_parser_get_init - gets the buffer for trace parser */ int trace_parser_get_init(struct trace_parser *parser, int size) { memset(parser, 0, sizeof(*parser)); parser->buffer = kmalloc(size, GFP_KERNEL); if (!parser->buffer) return 1; parser->size = size; return 0; } /* * trace_parser_put - frees the buffer for trace parser */ void trace_parser_put(struct trace_parser *parser) { kfree(parser->buffer); parser->buffer = NULL; } /* * trace_get_user - reads the user input string separated by space * (matched by isspace(ch)) * * For each string found the 'struct trace_parser' is updated, * and the function returns. * * Returns number of bytes read. * * See kernel/trace/trace.h for 'struct trace_parser' details. */ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, size_t cnt, loff_t *ppos) { char ch; size_t read = 0; ssize_t ret; if (!*ppos) trace_parser_clear(parser); ret = get_user(ch, ubuf++); if (ret) goto out; read++; cnt--; /* * The parser is not finished with the last write, * continue reading the user input without skipping spaces. */ if (!parser->cont) { /* skip white space */ while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) goto out; read++; cnt--; } parser->idx = 0; /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; ret = read; goto out; } } /* read the non-space input */ while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; goto out; } ret = get_user(ch, ubuf++); if (ret) goto out; read++; cnt--; } /* We either got finished input or we have to wait for another call. */ if (isspace(ch) || !ch) { parser->buffer[parser->idx] = 0; parser->cont = false; } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; /* Make sure the parsed string always terminates with '\0'. */ parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; goto out; } *ppos += read; ret = read; out: return ret; } /* TODO add a seq_buf_to_buffer() */ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; if (trace_seq_used(s) <= s->readpos) return -EBUSY; len = trace_seq_used(s) - s->readpos; if (cnt > len) cnt = len; memcpy(buf, s->buffer + s->readpos, cnt); s->readpos += cnt; return cnt; } unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops; #ifdef LATENCY_FS_NOTIFY static struct workqueue_struct *fsnotify_wq; static void latency_fsnotify_workfn(struct work_struct *work) { struct trace_array *tr = container_of(work, struct trace_array, fsnotify_work); fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); } static void latency_fsnotify_workfn_irq(struct irq_work *iwork) { struct trace_array *tr = container_of(iwork, struct trace_array, fsnotify_irqwork); queue_work(fsnotify_wq, &tr->fsnotify_work); } static void trace_create_maxlat_file(struct trace_array *tr, struct dentry *d_tracer) { INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); tr->d_max_latency = trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, d_tracer, tr, &tracing_max_lat_fops); } __init static int latency_fsnotify_init(void) { fsnotify_wq = alloc_workqueue("tr_max_lat_wq", WQ_UNBOUND | WQ_HIGHPRI, 0); if (!fsnotify_wq) { pr_err("Unable to allocate tr_max_lat_wq\n"); return -ENOMEM; } return 0; } late_initcall_sync(latency_fsnotify_init); void latency_fsnotify(struct trace_array *tr) { if (!fsnotify_wq) return; /* * We cannot call queue_work(&tr->fsnotify_work) from here because it's * possible that we are called from __schedule() or do_idle(), which * could cause a deadlock. */ irq_work_queue(&tr->fsnotify_irqwork); } #else /* !LATENCY_FS_NOTIFY */ #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ d_tracer, tr, &tracing_max_lat_fops) #endif /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct array_buffer *trace_buf = &tr->array_buffer; struct array_buffer *max_buf = &tr->max_buffer; struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); max_buf->cpu = cpu; max_buf->time_start = data->preempt_timestamp; max_data->saved_latency = tr->max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; strscpy(max_data->comm, tsk->comm); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use * RCU. The irq tracer can be called out of RCU scope. */ if (tsk == current) max_data->uid = current_uid(); else max_data->uid = task_uid(tsk); max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; max_data->policy = tsk->policy; max_data->rt_priority = tsk->rt_priority; /* record this tasks comm */ tracing_record_cmdline(tsk); latency_fsnotify(tr); } /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer * @tsk: the task with the latency * @cpu: The cpu that initiated the trace. * @cond_data: User data associated with a conditional snapshot * * Flip the buffers between the @tr and the max_tr and record information * about which task was the cause of this latency. */ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data) { if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&tr->max_lock); /* Inherit the recordable setting from array_buffer */ if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) ring_buffer_record_on(tr->max_buffer.buffer); else ring_buffer_record_off(tr->max_buffer.buffer); #ifdef CONFIG_TRACER_SNAPSHOT if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { arch_spin_unlock(&tr->max_lock); return; } #endif swap(tr->array_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); /* Any waiters on the old snapshot buffer need to wake up */ ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); } /** * update_max_tr_single - only copy one trace over, and reset the rest * @tr: tracer * @tsk: task with the latency * @cpu: the cpu of the buffer to copy. * * Flip the trace of a single CPU buffer between the @tr and the max_tr. */ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&tr->max_lock); ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); if (ret == -EBUSY) { /* * We failed to swap the buffer due to a commit taking * place on this CPU. We fail to record, but we reset * the max trace buffer (no one writes directly to it) * and flag that it failed. * Another reason is resize is in progress. */ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, "Failed to swap buffers due to commit or resize in progress\n"); } WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ struct pipe_wait { struct trace_iterator *iter; int wait_index; }; static bool wait_pipe_cond(void *data) { struct pipe_wait *pwait = data; struct trace_iterator *iter = pwait->iter; if (atomic_read_acquire(&iter->wait_index) != pwait->wait_index) return true; return iter->closed; } static int wait_on_pipe(struct trace_iterator *iter, int full) { struct pipe_wait pwait; int ret; /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; pwait.wait_index = atomic_read_acquire(&iter->wait_index); pwait.iter = iter; ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full, wait_pipe_cond, &pwait); #ifdef CONFIG_TRACER_MAX_TRACE /* * Make sure this is still the snapshot buffer, as if a snapshot were * to happen, this would now be the main buffer. */ if (iter->snapshot) iter->array_buffer = &iter->tr->max_buffer; #endif return ret; } #ifdef CONFIG_FTRACE_STARTUP_TEST static bool selftests_can_run; struct trace_selftests { struct list_head list; struct tracer *type; }; static LIST_HEAD(postponed_selftests); static int save_selftest(struct tracer *type) { struct trace_selftests *selftest; selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); if (!selftest) return -ENOMEM; selftest->type = type; list_add(&selftest->list, &postponed_selftests); return 0; } static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; struct tracer *saved_tracer = tr->current_trace; int ret; if (!type->selftest || tracing_selftest_disabled) return 0; /* * If a tracer registers early in boot up (before scheduling is * initialized and such), then do not run its selftests yet. * Instead, run it a little later in the boot process. */ if (!selftests_can_run) return save_selftest(type); if (!tracing_is_on()) { pr_warn("Selftest for tracer %s skipped due to tracing disabled\n", type->name); return 0; } /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current * tracer to be this tracer. The tracer can then run some * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ tracing_reset_online_cpus(&tr->array_buffer); tr->current_trace = type; #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); tr->allocated_snapshot = true; } #endif /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); /* the test is responsible for resetting too */ tr->current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); /* Add the warning after printing 'FAILED' */ WARN_ON(1); return -1; } /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { tr->allocated_snapshot = false; /* Shrink the max buffer again */ if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } #endif printk(KERN_CONT "PASSED\n"); return 0; } static int do_run_tracer_selftest(struct tracer *type) { int ret; /* * Tests can take a long time, especially if they are run one after the * other, as does happen during bootup when all the tracers are * registered. This could cause the soft lockup watchdog to trigger. */ cond_resched(); tracing_selftest_running = true; ret = run_tracer_selftest(type); tracing_selftest_running = false; return ret; } static __init int init_trace_selftests(void) { struct trace_selftests *p, *n; struct tracer *t, **last; int ret; selftests_can_run = true; guard(mutex)(&trace_types_lock); if (list_empty(&postponed_selftests)) return 0; pr_info("Running postponed tracer tests:\n"); tracing_selftest_running = true; list_for_each_entry_safe(p, n, &postponed_selftests, list) { /* This loop can take minutes when sanitizers are enabled, so * lets make sure we allow RCU processing. */ cond_resched(); ret = run_tracer_selftest(p->type); /* If the test fails, then warn and remove from available_tracers */ if (ret < 0) { WARN(1, "tracer: %s failed selftest, disabling\n", p->type->name); last = &trace_types; for (t = trace_types; t; t = t->next) { if (t == p->type) { *last = t->next; break; } last = &t->next; } } list_del(&p->list); kfree(p); } tracing_selftest_running = false; return 0; } core_initcall(init_trace_selftests); #else static inline int do_run_tracer_selftest(struct tracer *type) { return 0; } #endif /* CONFIG_FTRACE_STARTUP_TEST */ static void add_tracer_options(struct trace_array *tr, struct tracer *t); static void __init apply_trace_boot_options(void); /** * register_tracer - register a tracer with the ftrace system. * @type: the plugin for the tracer * * Register a new plugin tracer. */ int __init register_tracer(struct tracer *type) { struct tracer *t; int ret = 0; if (!type->name) { pr_info("Tracer must have a name\n"); return -1; } if (strlen(type->name) >= MAX_TRACER_SIZE) { pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); return -1; } if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Can not register tracer %s due to lockdown\n", type->name); return -EPERM; } mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ pr_info("Tracer %s already registered\n", type->name); ret = -1; goto out; } } if (!type->set_flag) type->set_flag = &dummy_set_flag; if (!type->flags) { /*allocate a dummy tracer_flags*/ type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); if (!type->flags) { ret = -ENOMEM; goto out; } type->flags->val = 0; type->flags->opts = dummy_tracer_opt; } else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; /* store the tracer for __set_tracer_option */ type->flags->trace = type; ret = do_run_tracer_selftest(type); if (ret < 0) goto out; type->next = trace_types; trace_types = type; add_tracer_options(&global_trace, type); out: mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) goto out_unlock; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) goto out_unlock; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; apply_trace_boot_options(); /* disable other selftests, since this will break it. */ disable_tracing_selftest("running a tracer"); out_unlock: return ret; } static void tracing_reset_cpu(struct array_buffer *buf, int cpu) { struct trace_buffer *buffer = buf->buffer; if (!buffer) return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_rcu(); ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } void tracing_reset_online_cpus(struct array_buffer *buf) { struct trace_buffer *buffer = buf->buffer; if (!buffer) return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_rcu(); buf->time_start = buffer_ftrace_now(buf, buf->cpu); ring_buffer_reset_online_cpus(buffer); ring_buffer_record_enable(buffer); } static void tracing_reset_all_cpus(struct array_buffer *buf) { struct trace_buffer *buffer = buf->buffer; if (!buffer) return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_rcu(); buf->time_start = buffer_ftrace_now(buf, buf->cpu); ring_buffer_reset(buffer); ring_buffer_record_enable(buffer); } /* Must have trace_types_lock held */ void tracing_reset_all_online_cpus_unlocked(void) { struct trace_array *tr; lockdep_assert_held(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->clear_trace) continue; tr->clear_trace = false; tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); #endif } } void tracing_reset_all_online_cpus(void) { mutex_lock(&trace_types_lock); tracing_reset_all_online_cpus_unlocked(); mutex_unlock(&trace_types_lock); } int is_tracing_stopped(void) { return global_trace.stop_count; } static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; if (tracing_disabled) return; raw_spin_lock_irqsave(&tr->start_lock, flags); if (--tr->stop_count) { if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ tr->stop_count = 0; } goto out; } /* Prevent the buffers from switching */ arch_spin_lock(&tr->max_lock); buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #endif arch_spin_unlock(&tr->max_lock); out: raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** * tracing_start - quick start of the tracer * * If tracing is enabled but was stopped by tracing_stop, * this will start the tracer back up. */ void tracing_start(void) { return tracing_start_tr(&global_trace); } static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; raw_spin_lock_irqsave(&tr->start_lock, flags); if (tr->stop_count++) goto out; /* Prevent the buffers from switching */ arch_spin_lock(&tr->max_lock); buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #endif arch_spin_unlock(&tr->max_lock); out: raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** * tracing_stop - quick stop of the tracer * * Light weight way to stop tracing. Use in conjunction with * tracing_start. */ void tracing_stop(void) { return tracing_stop_tr(&global_trace); } /* * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function * simplifies those functions and keeps them in sync. */ enum print_line_t trace_handle_return(struct trace_seq *s) { return trace_seq_has_overflowed(s) ? TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; } EXPORT_SYMBOL_GPL(trace_handle_return); static unsigned short migration_disable_value(void) { #if defined(CONFIG_SMP) return current->migration_disabled; #else return 0; #endif } unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) { unsigned int trace_flags = irqs_status; unsigned int pc; pc = preempt_count(); if (pc & NMI_MASK) trace_flags |= TRACE_FLAG_NMI; if (pc & HARDIRQ_MASK) trace_flags |= TRACE_FLAG_HARDIRQ; if (in_serving_softirq()) trace_flags |= TRACE_FLAG_SOFTIRQ; if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) trace_flags |= TRACE_FLAG_BH_OFF; if (tif_need_resched()) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; if (IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY) && tif_test_bit(TIF_NEED_RESCHED_LAZY)) trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } struct ring_buffer_event * trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned int trace_ctx) { return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx); } DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); DEFINE_PER_CPU(int, trace_buffered_event_cnt); static int trace_buffered_event_ref; /** * trace_buffered_event_enable - enable buffering events * * When events are being filtered, it is quicker to use a temporary * buffer to write the event data into if there's a likely chance * that it will not be committed. The discard of the ring buffer * is not as fast as committing, and is much slower than copying * a commit. * * When an event is to be filtered, allocate per cpu buffers to * write the event data into, and if the event is filtered and discarded * it is simply dropped, otherwise, the entire data is to be committed * in one shot. */ void trace_buffered_event_enable(void) { struct ring_buffer_event *event; struct page *page; int cpu; WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); if (trace_buffered_event_ref++) return; for_each_tracing_cpu(cpu) { page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); /* This is just an optimization and can handle failures */ if (!page) { pr_err("Failed to allocate event buffer\n"); break; } event = page_address(page); memset(event, 0, sizeof(*event)); per_cpu(trace_buffered_event, cpu) = event; preempt_disable(); if (cpu == smp_processor_id() && __this_cpu_read(trace_buffered_event) != per_cpu(trace_buffered_event, cpu)) WARN_ON_ONCE(1); preempt_enable(); } } static void enable_trace_buffered_event(void *data) { /* Probably not needed, but do it anyway */ smp_rmb(); this_cpu_dec(trace_buffered_event_cnt); } static void disable_trace_buffered_event(void *data) { this_cpu_inc(trace_buffered_event_cnt); } /** * trace_buffered_event_disable - disable buffering events * * When a filter is removed, it is faster to not use the buffered * events, and to commit directly into the ring buffer. Free up * the temp buffers when there are no more users. This requires * special synchronization with current events. */ void trace_buffered_event_disable(void) { int cpu; WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); if (WARN_ON_ONCE(!trace_buffered_event_ref)) return; if (--trace_buffered_event_ref) return; /* For each CPU, set the buffer as used. */ on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event, NULL, true); /* Wait for all current users to finish */ synchronize_rcu(); for_each_tracing_cpu(cpu) { free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); per_cpu(trace_buffered_event, cpu) = NULL; } /* * Wait for all CPUs that potentially started checking if they can use * their event buffer only after the previous synchronize_rcu() call and * they still read a valid pointer from trace_buffered_event. It must be * ensured they don't see cleared trace_buffered_event_cnt else they * could wrongly decide to use the pointed-to buffer which is now freed. */ synchronize_rcu(); /* For each CPU, relinquish the buffer */ on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, true); } static struct trace_buffer *temp_buffer; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, struct trace_event_file *trace_file, int type, unsigned long len, unsigned int trace_ctx) { struct ring_buffer_event *entry; struct trace_array *tr = trace_file->tr; int val; *current_rb = tr->array_buffer.buffer; if (!tr->no_filter_buffering_ref && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) { preempt_disable_notrace(); /* * Filtering is on, so try to use the per cpu buffer first. * This buffer will simulate a ring_buffer_event, * where the type_len is zero and the array[0] will * hold the full length. * (see include/linux/ring-buffer.h for details on * how the ring_buffer_event is structured). * * Using a temp buffer during filtering and copying it * on a matched filter is quicker than writing directly * into the ring buffer and then discarding it when * it doesn't match. That is because the discard * requires several atomic operations to get right. * Copying on match and doing nothing on a failed match * is still quicker than no copy on match, but having * to discard out of the ring buffer on a failed match. */ if ((entry = __this_cpu_read(trace_buffered_event))) { int max_len = PAGE_SIZE - struct_size(entry, array, 1); val = this_cpu_inc_return(trace_buffered_event_cnt); /* * Preemption is disabled, but interrupts and NMIs * can still come in now. If that happens after * the above increment, then it will have to go * back to the old method of allocating the event * on the ring buffer, and if the filter fails, it * will have to call ring_buffer_discard_commit() * to remove it. * * Need to also check the unlikely case that the * length is bigger than the temp buffer size. * If that happens, then the reserve is pretty much * guaranteed to fail, as the ring buffer currently * only allows events less than a page. But that may * change in the future, so let the ring buffer reserve * handle the failure in that case. */ if (val == 1 && likely(len <= max_len)) { trace_event_setup(entry, type, trace_ctx); entry->array[0] = len; /* Return with preemption disabled */ return entry; } this_cpu_dec(trace_buffered_event_cnt); } /* __trace_buffer_lock_reserve() disables preemption */ preempt_enable_notrace(); } entry = __trace_buffer_lock_reserve(*current_rb, type, len, trace_ctx); /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer * to store the trace event for the trigger to use. It's recursive * safe and will not be recorded anywhere. */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { *current_rb = temp_buffer; entry = __trace_buffer_lock_reserve(*current_rb, type, len, trace_ctx); } return entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock); static DEFINE_MUTEX(tracepoint_printk_mutex); static void output_printk(struct trace_event_buffer *fbuffer) { struct trace_event_call *event_call; struct trace_event_file *file; struct trace_event *event; unsigned long flags; struct trace_iterator *iter = tracepoint_print_iter; /* We should never get here if iter is NULL */ if (WARN_ON_ONCE(!iter)) return; event_call = fbuffer->trace_file->event_call; if (!event_call || !event_call->event.funcs || !event_call->event.funcs->trace) return; file = fbuffer->trace_file; if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && !filter_match_preds(file->filter, fbuffer->entry))) return; event = &fbuffer->trace_file->event_call->event; raw_spin_lock_irqsave(&tracepoint_iter_lock, flags); trace_seq_init(&iter->seq); iter->ent = fbuffer->entry; event_call->event.funcs->trace(iter, 0, event); trace_seq_putc(&iter->seq, 0); printk("%s", iter->seq.buffer); raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } int tracepoint_printk_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; int ret; guard(mutex)(&tracepoint_printk_mutex); save_tracepoint_printk = tracepoint_printk; ret = proc_dointvec(table, write, buffer, lenp, ppos); /* * This will force exiting early, as tracepoint_printk * is always zero when tracepoint_printk_iter is not allocated */ if (!tracepoint_print_iter) tracepoint_printk = 0; if (save_tracepoint_printk == tracepoint_printk) return ret; if (tracepoint_printk) static_key_enable(&tracepoint_printk_key.key); else static_key_disable(&tracepoint_printk_key.key); return ret; } void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) { enum event_trigger_type tt = ETT_NONE; struct trace_event_file *file = fbuffer->trace_file; if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event, fbuffer->entry, &tt)) goto discard; if (static_key_false(&tracepoint_printk_key.key)) output_printk(fbuffer); if (static_branch_unlikely(&trace_event_exports_enabled)) ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer, fbuffer->event, fbuffer->trace_ctx, fbuffer->regs); discard: if (tt) event_triggers_post_call(file, tt); } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); /* * Skip 3: * * trace_buffer_unlock_commit_regs() * trace_event_buffer_commit() * trace_event_raw_event_xxx() */ # define STACK_SKIP 3 void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned int trace_ctx, struct pt_regs *regs) { __buffer_unlock_commit(buffer, event); /* * If regs is not set, then skip the necessary functions. * Note, we can still get here via blktrace, wakeup tracer * and mmiotrace, but that's ok if they lose a function or * two. They are not that meaningful. */ ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs); ftrace_trace_userstack(tr, buffer, trace_ctx); } /* * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. */ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event) { __buffer_unlock_commit(buffer, event); } void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx) { struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; if (static_branch_unlikely(&trace_function_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); } #ifdef CONFIG_STACKTRACE /* Allow 4 levels of nesting: normal, softirq, irq, NMI */ #define FTRACE_KSTACK_NESTING 4 #define FTRACE_KSTACK_ENTRIES (SZ_4K / FTRACE_KSTACK_NESTING) struct ftrace_stack { unsigned long calls[FTRACE_KSTACK_ENTRIES]; }; struct ftrace_stacks { struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; }; static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); static void __ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { struct ring_buffer_event *event; unsigned int size, nr_entries; struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; /* * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ #ifndef CONFIG_UNWINDER_ORC if (!regs) skip++; #endif preempt_disable_notrace(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; /* This should never happen. If it does, yell once and skip */ if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING)) goto out; /* * The above __this_cpu_inc_return() is 'atomic' cpu local. An * interrupt will either see the value pre increment or post * increment. If the interrupt happens pre increment it will have * restored the counter when it returns. We just need a barrier to * keep gcc from moving things around. */ barrier(); fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; size = ARRAY_SIZE(fstack->calls); if (regs) { nr_entries = stack_trace_save_regs(regs, fstack->calls, size, skip); } else { nr_entries = stack_trace_save(fstack->calls, size, skip); } #ifdef CONFIG_DYNAMIC_FTRACE /* Mark entry of stack trace as trampoline code */ if (tr->ops && tr->ops->trampoline) { unsigned long tramp_start = tr->ops->trampoline; unsigned long tramp_end = tramp_start + tr->ops->trampoline_size; unsigned long *calls = fstack->calls; for (int i = 0; i < nr_entries; i++) { if (calls[i] >= tramp_start && calls[i] < tramp_end) calls[i] = FTRACE_TRAMPOLINE_MARKER; } } #endif event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, struct_size(entry, caller, nr_entries), trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->size = nr_entries; memcpy(&entry->caller, fstack->calls, flex_array_size(entry, caller, nr_entries)); __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); preempt_enable_notrace(); } static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); } void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip) { struct trace_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); return; } if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY))) return; /* * When an NMI triggers, RCU is enabled via ct_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI * triggered someplace critical, and ct_irq_enter() should * not be called from NMI. */ if (unlikely(in_nmi())) return; ct_irq_enter_irqson(); __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); ct_irq_exit_irqson(); } /** * trace_dump_stack - record a stack back trace in the trace buffer * @skip: Number of functions to skip (helper handlers) */ void trace_dump_stack(int skip) { if (tracing_disabled || tracing_selftest_running) return; #ifndef CONFIG_UNWINDER_ORC /* Skip 1 to skip this function. */ skip++; #endif __ftrace_trace_stack(printk_trace, printk_trace->array_buffer.buffer, tracing_gen_ctx(), skip, NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); #ifdef CONFIG_USER_STACKTRACE_SUPPORT static DEFINE_PER_CPU(int, user_stack_count); static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx) { struct ring_buffer_event *event; struct userstack_entry *entry; if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) return; /* * NMIs can not handle page faults, even with fix ups. * The save user stack can (and often does) fault. */ if (unlikely(in_nmi())) return; /* * prevent recursion, since the user stack tracing may * trigger other kernel events. */ preempt_disable(); if (__this_cpu_read(user_stack_count)) goto out; __this_cpu_inc(user_stack_count); event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), trace_ctx); if (!event) goto out_drop_count; entry = ring_buffer_event_data(event); entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); out: preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx) { } #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* CONFIG_STACKTRACE */ static inline void func_repeats_set_delta_ts(struct func_repeats_entry *entry, unsigned long long delta) { entry->bottom_delta_ts = delta & U32_MAX; entry->top_delta_ts = (delta >> 32); } void trace_last_func_repeats(struct trace_array *tr, struct trace_func_repeats *last_info, unsigned int trace_ctx) { struct trace_buffer *buffer = tr->array_buffer.buffer; struct func_repeats_entry *entry; struct ring_buffer_event *event; u64 delta; event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS, sizeof(*entry), trace_ctx); if (!event) return; delta = ring_buffer_event_time_stamp(buffer, event) - last_info->ts_last_call; entry = ring_buffer_event_data(event); entry->ip = last_info->ip; entry->parent_ip = last_info->parent_ip; entry->count = last_info->count; func_repeats_set_delta_ts(entry, delta); __buffer_unlock_commit(buffer, event); } /* created for use with alloc_percpu */ struct trace_buffer_struct { int nesting; char buffer[4][TRACE_BUF_SIZE]; }; static struct trace_buffer_struct __percpu *trace_percpu_buffer; /* * This allows for lockless recording. If we're nested too deeply, then * this returns NULL. */ static char *get_trace_buf(void) { struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); if (!trace_percpu_buffer || buffer->nesting >= 4) return NULL; buffer->nesting++; /* Interrupts must see nesting incremented before we use the buffer */ barrier(); return &buffer->buffer[buffer->nesting - 1][0]; } static void put_trace_buf(void) { /* Don't let the decrement of nesting leak before this */ barrier(); this_cpu_dec(trace_percpu_buffer->nesting); } static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct __percpu *buffers; if (trace_percpu_buffer) return 0; buffers = alloc_percpu(struct trace_buffer_struct); if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) return -ENOMEM; trace_percpu_buffer = buffers; return 0; } static int buffers_allocated; void trace_printk_init_buffers(void) { if (buffers_allocated) return; if (alloc_percpu_trace_buffer()) return; /* trace_printk() is for debug use only. Don't use it in production. */ pr_warn("\n"); pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("** **\n"); pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); pr_warn("** **\n"); pr_warn("** This means that this is a DEBUG kernel and it is **\n"); pr_warn("** unsafe for production use. **\n"); pr_warn("** **\n"); pr_warn("** If you see this message and you are not debugging **\n"); pr_warn("** the kernel, report this immediately to your vendor! **\n"); pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(&global_trace); buffers_allocated = 1; /* * trace_printk_init_buffers() can be called by modules. * If that happens, then we need to start cmdline recording * directly here. If the global_trace.buffer is already * allocated here, then this was called by module code. */ if (global_trace.array_buffer.buffer) tracing_start_cmdline_record(); } EXPORT_SYMBOL_GPL(trace_printk_init_buffers); void trace_printk_start_comm(void) { /* Start tracing comms if trace printk is set */ if (!buffers_allocated) return; tracing_start_cmdline_record(); } static void trace_printk_start_stop_comm(int enabled) { if (!buffers_allocated) return; if (enabled) tracing_start_cmdline_record(); else tracing_stop_cmdline_record(); } /** * trace_vbprintk - write binary msg to tracing buffer * @ip: The address of the caller * @fmt: The string format to write to the buffer * @args: Arguments for @fmt */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { struct ring_buffer_event *event; struct trace_buffer *buffer; struct trace_array *tr = READ_ONCE(printk_trace); struct bprint_entry *entry; unsigned int trace_ctx; char *tbuffer; int len = 0, size; if (!printk_binsafe(tr)) return trace_vprintk(ip, fmt, args); if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) goto out_put; size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); out: ring_buffer_nest_end(buffer); out_put: put_trace_buf(); out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); return len; } EXPORT_SYMBOL_GPL(trace_vbprintk); __printf(3, 0) static int __trace_array_vprintk(struct trace_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { struct ring_buffer_event *event; int len = 0, size; struct print_entry *entry; unsigned int trace_ctx; char *tbuffer; if (tracing_disabled) return 0; /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; goto out_nobuffer; } len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; memcpy(&entry->buf, tbuffer, len + 1); __buffer_unlock_commit(buffer, event); ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); out: ring_buffer_nest_end(buffer); put_trace_buf(); out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); return len; } __printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { if (tracing_selftest_running && tr == &global_trace) return 0; return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } /** * trace_array_printk - Print a message to a specific instance * @tr: The instance trace_array descriptor * @ip: The instruction pointer that this is called from. * @fmt: The format to print (printf format) * * If a subsystem sets up its own instance, they have the right to * printk strings into their tracing instance buffer using this * function. Note, this function will not write into the top level * buffer (use trace_printk() for that), as writing into the top level * buffer should only have events that can be individually disabled. * trace_printk() is only used for debugging a kernel, and should not * be ever incorporated in normal use. * * trace_array_printk() can be used, as it will not add noise to the * top level tracing buffer. * * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ __printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { int ret; va_list ap; if (!tr) return -ENOENT; /* This is only allowed for created instances */ if (tr == &global_trace) return 0; if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); return ret; } EXPORT_SYMBOL_GPL(trace_array_printk); /** * trace_array_init_printk - Initialize buffers for trace_array_printk() * @tr: The trace array to initialize the buffers for * * As trace_array_printk() only writes into instances, they are OK to * have in the kernel (unlike trace_printk()). This needs to be called * before trace_array_printk() can be used on a trace_array. */ int trace_array_init_printk(struct trace_array *tr) { if (!tr) return -ENOENT; /* This is only allowed for created instances */ if (tr == &global_trace) return -EINVAL; return alloc_percpu_trace_buffer(); } EXPORT_SYMBOL_GPL(trace_array_init_printk); __printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { int ret; va_list ap; if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); ret = __trace_array_vprintk(buffer, ip, fmt, ap); va_end(ap); return ret; } __printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(printk_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); iter->idx++; if (buf_iter) ring_buffer_iter_advance(buf_iter); } static struct trace_entry * peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { event = ring_buffer_iter_peek(buf_iter, ts); if (lost_events) *lost_events = ring_buffer_iter_dropped(buf_iter) ? (unsigned long)-1 : 0; } else { event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts, lost_events); } if (event) { iter->ent_size = ring_buffer_event_length(event); return ring_buffer_event_data(event); } iter->ent_size = 0; return NULL; } static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { struct trace_buffer *buffer = iter->array_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; int next_size = 0; int cpu; /* * If we are in a per_cpu trace file, don't bother by iterating over * all cpu and peek directly. */ if (cpu_file > RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); if (ent_cpu) *ent_cpu = cpu_file; return ent; } for_each_tracing_cpu(cpu) { if (ring_buffer_empty_cpu(buffer, cpu)) continue; ent = peek_next_entry(iter, cpu, &ts, &lost_events); /* * Pick the entry with the smallest timestamp: */ if (ent && (!next || ts < next_ts)) { next = ent; next_cpu = cpu; next_ts = ts; next_lost = lost_events; next_size = iter->ent_size; } } iter->ent_size = next_size; if (ent_cpu) *ent_cpu = next_cpu; if (ent_ts) *ent_ts = next_ts; if (missing_events) *missing_events = next_lost; return next; } #define STATIC_FMT_BUF_SIZE 128 static char static_fmt_buf[STATIC_FMT_BUF_SIZE]; char *trace_iter_expand_format(struct trace_iterator *iter) { char *tmp; /* * iter->tr is NULL when used with tp_printk, which makes * this get called where it is not safe to call krealloc(). */ if (!iter->tr || iter->fmt == static_fmt_buf) return NULL; tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE, GFP_KERNEL); if (tmp) { iter->fmt_size += STATIC_FMT_BUF_SIZE; iter->fmt = tmp; } return tmp; } /* Returns true if the string is safe to dereference from an event */ static bool trace_safe_str(struct trace_iterator *iter, const char *str) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) return true; /* OK if part of the temp seq buffer */ if ((addr >= (unsigned long)iter->tmp_seq.buffer) && (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE)) return true; /* Core rodata can not be freed */ if (is_kernel_rodata(addr)) return true; if (trace_is_tracepoint_string(str)) return true; /* * Now this could be a module event, referencing core module * data, which is OK. */ if (!iter->ent) return false; trace_event = ftrace_find_event(iter->ent->type); if (!trace_event) return false; event = container_of(trace_event, struct trace_event_call, event); if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module) return false; /* Would rather have rodata, but this will suffice */ if (within_module_core(addr, event->module)) return true; return false; } /** * ignore_event - Check dereferenced fields while writing to the seq buffer * @iter: The iterator that holds the seq buffer and the event being printed * * At boot up, test_event_printk() will flag any event that dereferences * a string with "%s" that does exist in the ring buffer. It may still * be valid, as the string may point to a static string in the kernel * rodata that never gets freed. But if the string pointer is pointing * to something that was allocated, there's a chance that it can be freed * by the time the user reads the trace. This would cause a bad memory * access by the kernel and possibly crash the system. * * This function will check if the event has any fields flagged as needing * to be checked at runtime and perform those checks. * * If it is found that a field is unsafe, it will write into the @iter->seq * a message stating what was found to be unsafe. * * @return: true if the event is unsafe and should be ignored, * false otherwise. */ bool ignore_event(struct trace_iterator *iter) { struct ftrace_event_field *field; struct trace_event *trace_event; struct trace_event_call *event; struct list_head *head; struct trace_seq *seq; const void *ptr; trace_event = ftrace_find_event(iter->ent->type); seq = &iter->seq; if (!trace_event) { trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type); return true; } event = container_of(trace_event, struct trace_event_call, event); if (!(event->flags & TRACE_EVENT_FL_TEST_STR)) return false; head = trace_get_fields(event); if (!head) { trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n", trace_event_name(event)); return true; } /* Offsets are from the iter->ent that points to the raw event */ ptr = iter->ent; list_for_each_entry(field, head, link) { const char *str; bool good; if (!field->needs_test) continue; str = *(const char **)(ptr + field->offset); good = trace_safe_str(iter, str); /* * If you hit this warning, it is likely that the * trace event in question used %s on a string that * was saved at the time of the event, but may not be * around when the trace is read. Use __string(), * __assign_str() and __get_str() helpers in the TRACE_EVENT() * instead. See samples/trace_events/trace-events-sample.h * for reference. */ if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'", trace_event_name(event), field->name)) { trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n", trace_event_name(event), field->name); return true; } } return false; } const char *trace_event_format(struct trace_iterator *iter, const char *fmt) { const char *p, *new_fmt; char *q; if (WARN_ON_ONCE(!fmt)) return fmt; if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR) return fmt; p = fmt; new_fmt = q = iter->fmt; while (*p) { if (unlikely(q - new_fmt + 3 > iter->fmt_size)) { if (!trace_iter_expand_format(iter)) return fmt; q += iter->fmt - new_fmt; new_fmt = iter->fmt; } *q++ = *p++; /* Replace %p with %px */ if (p[-1] == '%') { if (p[0] == '%') { *q++ = *p++; } else if (p[0] == 'p' && !isalnum(p[1])) { *q++ = *p++; *q++ = 'x'; } } } *q = '\0'; return new_fmt; } #define STATIC_TEMP_BUF_SIZE 128 static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { /* __find_next_entry will reset ent_size */ int ent_size = iter->ent_size; struct trace_entry *entry; /* * If called from ftrace_dump(), then the iter->temp buffer * will be the static_temp_buf and not created from kmalloc. * If the entry size is greater than the buffer, we can * not save it. Just return NULL in that case. This is only * used to add markers when two consecutive events' time * stamps have a large delta. See trace_print_lat_context() */ if (iter->temp == static_temp_buf && STATIC_TEMP_BUF_SIZE < ent_size) return NULL; /* * The __find_next_entry() may call peek_next_entry(), which may * call ring_buffer_peek() that may make the contents of iter->ent * undefined. Need to copy iter->ent now. */ if (iter->ent && iter->ent != iter->temp) { if ((!iter->temp || iter->temp_size < iter->ent_size) && !WARN_ON_ONCE(iter->temp == static_temp_buf)) { void *temp; temp = kmalloc(iter->ent_size, GFP_KERNEL); if (!temp) return NULL; kfree(iter->temp); iter->temp = temp; iter->temp_size = iter->ent_size; } memcpy(iter->temp, iter->ent, iter->ent_size); iter->ent = iter->temp; } entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); /* Put back the original ent_size */ iter->ent_size = ent_size; return entry; } /* Find the next real entry, and increment the iterator to the next entry */ void *trace_find_next_entry_inc(struct trace_iterator *iter) { iter->ent = __find_next_entry(iter, &iter->cpu, &iter->lost_events, &iter->ts); if (iter->ent) trace_iterator_increment(iter); return iter->ent ? iter : NULL; } static void trace_consume(struct trace_iterator *iter) { ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; int i = (int)*pos; void *ent; WARN_ON_ONCE(iter->leftover); (*pos)++; /* can't go backwards */ if (iter->idx > i) return NULL; if (iter->idx < 0) ent = trace_find_next_entry_inc(iter); else ent = iter; while (ent && iter->idx < i) ent = trace_find_next_entry_inc(iter); iter->pos = *pos; return ent; } void tracing_iter_reset(struct trace_iterator *iter, int cpu) { struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) return; ring_buffer_iter_reset(buf_iter); /* * We could have the case with the max latency tracers * that a reset never took place on a cpu. This is evident * by the timestamp being before the start of the buffer. */ while (ring_buffer_iter_peek(buf_iter, &ts)) { if (ts >= iter->array_buffer->time_start) break; entries++; ring_buffer_iter_advance(buf_iter); /* This could be a big loop */ cond_resched(); } per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; } /* * The current tracer is copied to avoid a global locking * all around. */ static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; mutex_lock(&trace_types_lock); if (unlikely(tr->current_trace != iter->trace)) { /* Close iter->trace before switching to the new current tracer */ if (iter->trace->close) iter->trace->close(iter); iter->trace = tr->current_trace; /* Reopen the new current tracer */ if (iter->trace->open) iter->trace->open(iter); } mutex_unlock(&trace_types_lock); #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return ERR_PTR(-EBUSY); #endif if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; iter->idx = -1; if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) tracing_iter_reset(iter, cpu); } else tracing_iter_reset(iter, cpu_file); iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; } else { /* * If we overflowed the seq_file before, then we want * to just reuse the trace_seq buffer again. */ if (iter->leftover) p = iter; else { l = *pos - 1; p = s_next(m, p, &l); } } trace_event_read_lock(); trace_access_lock(cpu_file); return p; } static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return; #endif trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } static void get_total_entries_cpu(struct array_buffer *buf, unsigned long *total, unsigned long *entries, int cpu) { unsigned long count; count = ring_buffer_entries_cpu(buf->buffer, cpu); /* * If this buffer has skipped entries, then we hold all * entries for the trace and we need to ignore the * ones before the time stamp. */ if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; /* total is the same as the entries */ *total = count; } else *total = count + ring_buffer_overrun_cpu(buf->buffer, cpu); *entries = count; } static void get_total_entries(struct array_buffer *buf, unsigned long *total, unsigned long *entries) { unsigned long t, e; int cpu; *total = 0; *entries = 0; for_each_tracing_cpu(cpu) { get_total_entries_cpu(buf, &t, &e, cpu); *total += t; *entries += e; } } unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) { unsigned long total, entries; if (!tr) tr = &global_trace; get_total_entries_cpu(&tr->array_buffer, &total, &entries, cpu); return entries; } unsigned long trace_total_entries(struct trace_array *tr) { unsigned long total, entries; if (!tr) tr = &global_trace; get_total_entries(&tr->array_buffer, &total, &entries); return entries; } static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" "# / _-----=> irqs-off/BH-disabled\n" "# | / _----=> need-resched \n" "# || / _---=> hardirq/softirq \n" "# ||| / _--=> preempt-depth \n" "# |||| / _-=> migrate-disable \n" "# ||||| / delay \n" "# cmd pid |||||| time | caller \n" "# \\ / |||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) { unsigned long total; unsigned long entries; get_total_entries(buf, &total, &entries); seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", entries, total, num_online_cpus()); seq_puts(m, "#\n"); } static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; print_event_info(buf, m); seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; static const char space[] = " "; int prec = tgid ? 12 : 2; print_event_info(buf, m); seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space); seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); seq_printf(m, "# %.*s|||| / delay\n", prec, space); seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); } void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK); struct array_buffer *buf = iter->array_buffer; struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); struct tracer *type = iter->trace; unsigned long entries; unsigned long total; const char *name = type->name; get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, init_utsname()->release); seq_puts(m, "# -----------------------------------" "---------------------------------\n"); seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" " (M:%s VP:%d, KP:%d, SP:%d HP:%d", nsecs_to_usecs(data->saved_latency), entries, total, buf->cpu, preempt_model_none() ? "server" : preempt_model_voluntary() ? "desktop" : preempt_model_full() ? "preempt" : preempt_model_lazy() ? "lazy" : preempt_model_rt() ? "preempt_rt" : "unknown", /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP seq_printf(m, " #P:%d)\n", num_online_cpus()); #else seq_puts(m, ")\n"); #endif seq_puts(m, "# -----------------\n"); seq_printf(m, "# | task: %.16s-%d " "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", data->comm, data->pid, from_kuid_munged(seq_user_ns(m), data->uid), data->nice, data->policy, data->rt_priority); seq_puts(m, "# -----------------\n"); if (data->critical_start) { seq_puts(m, "# => started at: "); seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); trace_print_seq(m, &iter->seq); seq_puts(m, "\n# => ended at: "); seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); trace_print_seq(m, &iter->seq); seq_puts(m, "\n#\n"); } seq_puts(m, "#\n"); } static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_array *tr = iter->tr; if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; if (cpumask_available(iter->started) && cpumask_test_cpu(iter->cpu, iter->started)) return; if (per_cpu_ptr(iter->array_buffer->data, iter->cpu)->skipped_entries) return; if (cpumask_available(iter->started)) cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ if (iter->idx > 1) trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; entry = iter->ent; test_cpu_buff_start(iter); event = ftrace_find_event(entry->type); if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { if (iter->iter_flags & TRACE_FILE_LAT_FMT) trace_print_lat_context(iter); else trace_print_context(iter); } if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; if (event) { if (tr->trace_flags & TRACE_ITER_FIELDS) return print_event_fields(iter, event); /* * For TRACE_EVENT() events, the print_fmt is not * safe to use if the array has delta offsets * Force printing via the fields. */ if ((tr->text_delta || tr->data_delta) && event->type > __TRACE_LAST_TYPE) return print_event_fields(iter, event); return event->funcs->trace(iter, sym_flags, event); } trace_seq_printf(s, "Unknown type %d\n", entry->type); return trace_handle_return(s); } static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; event = ftrace_find_event(entry->type); if (event) return event->funcs->raw(iter, 0, event); trace_seq_printf(s, "%d ?\n", entry->type); return trace_handle_return(s); } static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_HEX_FIELD(s, entry->pid); SEQ_PUT_HEX_FIELD(s, iter->cpu); SEQ_PUT_HEX_FIELD(s, iter->ts); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); if (event) { enum print_line_t ret = event->funcs->hex(iter, 0, event); if (ret != TRACE_TYPE_HANDLED) return ret; } SEQ_PUT_FIELD(s, newline); return trace_handle_return(s); } static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_FIELD(s, entry->pid); SEQ_PUT_FIELD(s, iter->cpu); SEQ_PUT_FIELD(s, iter->ts); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); return event ? event->funcs->binary(iter, 0, event) : TRACE_TYPE_HANDLED; } int trace_empty(struct trace_iterator *iter) { struct ring_buffer_iter *buf_iter; int cpu; /* If we are looking at one CPU buffer, only check that one */ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { cpu = iter->cpu_file; buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) return 0; } return 1; } for_each_tracing_cpu(cpu) { buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) return 0; } } return 1; } /* Called with trace_event_read_lock() held. */ enum print_line_t print_trace_line(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; unsigned long trace_flags = tr->trace_flags; enum print_line_t ret; if (iter->lost_events) { if (iter->lost_events == (unsigned long)-1) trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n", iter->cpu); else trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", iter->cpu, iter->lost_events); if (trace_seq_has_overflowed(&iter->seq)) return TRACE_TYPE_PARTIAL_LINE; } if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); if (ret != TRACE_TYPE_UNHANDLED) return ret; } if (iter->ent->type == TRACE_BPUTS && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) return trace_print_bputs_msg_only(iter); if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) return trace_print_printk_msg_only(iter); if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); if (trace_flags & TRACE_ITER_HEX) return print_hex_fmt(iter); if (trace_flags & TRACE_ITER_RAW) return print_raw_fmt(iter); return print_trace_fmt(iter); } void trace_latency_header(struct seq_file *m) { struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; unsigned long trace_flags = tr->trace_flags; if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; print_trace_header(m, iter); if (!(trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) print_func_help_header_irq(iter->array_buffer, m, trace_flags); else print_func_help_header(iter->array_buffer, m, trace_flags); } } } static void test_ftrace_alive(struct seq_file *m) { if (!ftrace_is_dead()) return; seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n" "# MAY BE MISSING FUNCTION EVENTS\n"); } #ifdef CONFIG_TRACER_MAX_TRACE static void show_snapshot_main_help(struct seq_file *m) { seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" "# Takes a snapshot of the main buffer.\n" "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" "# (Doesn't have to be '2' works with any number that\n" "# is not a '0' or '1')\n"); } static void show_snapshot_percpu_help(struct seq_file *m) { seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" "# Takes a snapshot of the main buffer for this cpu.\n"); #else seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" "# Must use main snapshot file to allocate.\n"); #endif seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" "# (Doesn't have to be '2' works with any number that\n" "# is not a '0' or '1')\n"); } static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { if (iter->tr->allocated_snapshot) seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); else seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); seq_puts(m, "# Snapshot commands:\n"); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) show_snapshot_main_help(m); else show_snapshot_percpu_help(m); } #else /* Should never be called */ static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } #endif static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; int ret; if (iter->ent == NULL) { if (iter->tr) { seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); test_ftrace_alive(m); } if (iter->snapshot && trace_empty(iter)) print_snapshot_help(m, iter); else if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); else trace_default_header(m); } else if (iter->leftover) { /* * If we filled the seq_file buffer earlier, we * want to just show it now. */ ret = trace_print_seq(m, &iter->seq); /* ret should this time be zero, but you never know */ iter->leftover = ret; } else { ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { iter->seq.full = 0; trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); } ret = trace_print_seq(m, &iter->seq); /* * If we overflow the seq_file buffer, then it will * ask us for this data again at start up. * Use that instead. * ret is 0 if seq_file write succeeded. * -1 otherwise. */ iter->leftover = ret; } return 0; } /* * Should be used after trace_array_get(), trace_types_lock * ensures that i_cdev was already initialized. */ static inline int tracing_get_cpu(struct inode *inode) { if (inode->i_cdev) /* See trace_create_cpu_file() */ return (long)inode->i_cdev - 1; return RING_BUFFER_ALL_CPUS; } static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; /* * Note, as iter itself can be allocated and freed in different * ways, this function is only used to free its content, and not * the iterator itself. The only requirement to all the allocations * is that it must zero all fields (kzalloc), as freeing works with * ethier allocated content or NULL. */ static void free_trace_iter_content(struct trace_iterator *iter) { /* The fmt is either NULL, allocated or points to static_fmt_buf */ if (iter->fmt != static_fmt_buf) kfree(iter->fmt); kfree(iter->temp); kfree(iter->buffer_iter); mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); } static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int cpu; if (tracing_disabled) return ERR_PTR(-ENODEV); iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); if (!iter) return ERR_PTR(-ENOMEM); iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; /* * trace_find_next_entry() may need to save off iter->ent. * It will place it into the iter->temp buffer. As most * events are less than 128, allocate a buffer of that size. * If one is greater, then trace_find_next_entry() will * allocate a new buffer to adjust for the bigger iter->ent. * It's not critical if it fails to get allocated here. */ iter->temp = kmalloc(128, GFP_KERNEL); if (iter->temp) iter->temp_size = 128; /* * trace_event_printf() may need to modify given format * string to replace %p with %px so that it shows real address * instead of hash value. However, that is only for the event * tracing, other tracer may not need. Defer the allocation * until it is needed. */ iter->fmt = NULL; iter->fmt_size = 0; mutex_lock(&trace_types_lock); iter->trace = tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; iter->tr = tr; #ifdef CONFIG_TRACER_MAX_TRACE /* Currently only the top directory has a snapshot */ if (tr->current_trace->print_max || snapshot) iter->array_buffer = &tr->max_buffer; else #endif iter->array_buffer = &tr->array_buffer; iter->snapshot = snapshot; iter->pos = -1; iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); /* Notify the tracer early; before we stop tracing. */ if (iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->array_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* * If pause-on-trace is enabled, then stop the trace while * dumping, unless this is the "snapshot" file */ if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE)) tracing_stop_tr(tr); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = ring_buffer_read_prepare(iter->array_buffer->buffer, cpu, GFP_KERNEL); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = ring_buffer_read_prepare(iter->array_buffer->buffer, cpu, GFP_KERNEL); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } mutex_unlock(&trace_types_lock); return iter; fail: mutex_unlock(&trace_types_lock); free_trace_iter_content(iter); release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } int tracing_open_generic(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; filp->private_data = inode->i_private; return 0; } bool tracing_is_disabled(void) { return (tracing_disabled) ? true: false; } /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. */ int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; filp->private_data = inode->i_private; return 0; } /* * The private pointer of the inode is the trace_event_file. * Update the tr ref count associated to it. */ int tracing_open_file_tr(struct inode *inode, struct file *filp) { struct trace_event_file *file = inode->i_private; int ret; ret = tracing_check_open_get_tr(file->tr); if (ret) return ret; mutex_lock(&event_mutex); /* Fail if the file is marked for removal */ if (file->flags & EVENT_FILE_FL_FREED) { trace_array_put(file->tr); ret = -ENODEV; } else { event_file_get(file); } mutex_unlock(&event_mutex); if (ret) return ret; filp->private_data = inode->i_private; return 0; } int tracing_release_file_tr(struct inode *inode, struct file *filp) { struct trace_event_file *file = inode->i_private; trace_array_put(file->tr); event_file_put(file); return 0; } int tracing_single_release_file_tr(struct inode *inode, struct file *filp) { tracing_release_file_tr(inode, filp); return single_release(inode, filp); } static int tracing_mark_open(struct inode *inode, struct file *filp) { stream_open(inode, filp); return tracing_open_generic_tr(inode, filp); } static int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; if (!(file->f_mode & FMODE_READ)) { trace_array_put(tr); return 0; } /* Writes do not use seq_file */ iter = m->private; mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); } if (iter->trace && iter->trace->close) iter->trace->close(iter); if (!iter->snapshot && tr->stop_count) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); __trace_array_put(tr); mutex_unlock(&trace_types_lock); free_trace_iter_content(iter); seq_release_private(inode, file); return 0; } int tracing_release_generic_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return 0; } static int tracing_single_release_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return single_release(inode, file); } static int tracing_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); struct array_buffer *trace_buf = &tr->array_buffer; #ifdef CONFIG_TRACER_MAX_TRACE if (tr->current_trace->print_max) trace_buf = &tr->max_buffer; #endif if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(trace_buf); else tracing_reset_cpu(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; } if (ret < 0) trace_array_put(tr); return ret; } /* * Some tracers are not suitable for instance buffers. * A tracer is always available for the global array (toplevel) * or if it explicitly states that it is. */ static bool trace_ok_for_array(struct tracer *t, struct trace_array *tr) { #ifdef CONFIG_TRACER_SNAPSHOT /* arrays with mapped buffer range do not have snapshots */ if (tr->range_addr_start && t->use_max_tr) return false; #endif return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; } /* Find the next tracer that this trace array may use */ static struct tracer * get_tracer_for_array(struct trace_array *tr, struct tracer *t) { while (t && !trace_ok_for_array(t, tr)) t = t->next; return t; } static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; struct tracer *t = v; (*pos)++; if (t) t = get_tracer_for_array(tr, t->next); return t; } static void *t_start(struct seq_file *m, loff_t *pos) { struct trace_array *tr = m->private; struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); t = get_tracer_for_array(tr, trace_types); for (; t && l < *pos; t = t_next(m, t, &l)) ; return t; } static void t_stop(struct seq_file *m, void *p) { mutex_unlock(&trace_types_lock); } static int t_show(struct seq_file *m, void *v) { struct tracer *t = v; if (!t) return 0; seq_puts(m, t->name); if (t->next) seq_putc(m, ' '); else seq_putc(m, '\n'); return 0; } static const struct seq_operations show_traces_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, .show = t_show, }; static int show_traces_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct seq_file *m; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = seq_open(file, &show_traces_seq_ops); if (ret) { trace_array_put(tr); return ret; } m = file->private_data; m->private = tr; return 0; } static int tracing_seq_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return seq_release(inode, file); } static ssize_t tracing_write_stub(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { return count; } loff_t tracing_lseek(struct file *file, loff_t offset, int whence) { int ret; if (file->f_mode & FMODE_READ) ret = seq_lseek(file, offset, whence); else file->f_pos = ret = 0; return ret; } static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .write = tracing_write_stub, .llseek = tracing_lseek, .release = tracing_release, }; static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_seq_release, }; static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; char *mask_str; int len; len = snprintf(NULL, 0, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)) + 1; mask_str = kmalloc(len, GFP_KERNEL); if (!mask_str) return -ENOMEM; len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); if (len >= count) { count = -EINVAL; goto out_err; } count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); out_err: kfree(mask_str); return count; } int tracing_set_cpumask(struct trace_array *tr, cpumask_var_t tracing_cpumask_new) { int cpu; if (!tr) return -EINVAL; local_irq_disable(); arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); #endif } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); #endif } } arch_spin_unlock(&tr->max_lock); local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); return 0; } static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; cpumask_var_t tracing_cpumask_new; int err; if (count == 0 || count > KMALLOC_MAX_SIZE) return -EINVAL; if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); if (err) goto err_free; err = tracing_set_cpumask(tr, tracing_cpumask_new); if (err) goto err_free; free_cpumask_var(tracing_cpumask_new); return count; err_free: free_cpumask_var(tracing_cpumask_new); return err; } static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic_tr, .read = tracing_cpumask_read, .write = tracing_cpumask_write, .release = tracing_release_generic_tr, .llseek = generic_file_llseek, }; static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; struct trace_array *tr = m->private; u32 tracer_flags; int i; guard(mutex)(&trace_types_lock); tracer_flags = tr->current_trace->flags->val; trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { if (tr->trace_flags & (1 << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); } for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) seq_printf(m, "%s\n", trace_opts[i].name); else seq_printf(m, "no%s\n", trace_opts[i].name); } return 0; } static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { struct tracer *trace = tracer_flags->trace; int ret; ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; if (neg) tracer_flags->val &= ~opts->bit; else tracer_flags->val |= opts->bit; return 0; } /* Try to assign a tracer specific option */ static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { struct tracer *trace = tr->current_trace; struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int i; for (i = 0; tracer_flags->opts[i].name; i++) { opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) return __set_tracer_option(tr, trace->flags, opts, neg); } return -EINVAL; } /* Some tracers require overwrite to stay enabled */ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) { if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) return -1; return 0; } int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { if ((mask == TRACE_ITER_RECORD_TGID) || (mask == TRACE_ITER_RECORD_CMD) || (mask == TRACE_ITER_TRACE_PRINTK)) lockdep_assert_held(&event_mutex); /* do nothing if flag is already set */ if (!!(tr->trace_flags & mask) == !!enabled) return 0; /* Give the tracer a chance to approve the change */ if (tr->current_trace->flag_changed) if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; if (mask == TRACE_ITER_TRACE_PRINTK) { if (enabled) { update_printk_trace(tr); } else { /* * The global_trace cannot clear this. * It's flag only gets cleared if another instance sets it. */ if (printk_trace == &global_trace) return -EINVAL; /* * An instance must always have it set. * by default, that's the global_trace instane. */ if (printk_trace == tr) update_printk_trace(&global_trace); } } if (enabled) tr->trace_flags |= mask; else tr->trace_flags &= ~mask; if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_RECORD_TGID) { if (trace_alloc_tgid_map() < 0) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; return -ENOMEM; } trace_event_enable_tgid_record(enabled); } if (mask == TRACE_ITER_EVENT_FORK) trace_event_follow_fork(tr, enabled); if (mask == TRACE_ITER_FUNC_FORK) ftrace_pid_follow_fork(tr, enabled); if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif } if (mask == TRACE_ITER_PRINTK) { trace_printk_start_stop_comm(enabled); trace_printk_control(enabled); } return 0; } int trace_set_options(struct trace_array *tr, char *option) { char *cmp; int neg = 0; int ret; size_t orig_len = strlen(option); int len; cmp = strstrip(option); len = str_has_prefix(cmp, "no"); if (len) neg = 1; cmp += len; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = match_string(trace_options, -1, cmp); /* If no option could be set, test the specific tracer options */ if (ret < 0) ret = set_tracer_option(tr, cmp, neg); else ret = set_tracer_flag(tr, 1 << ret, !neg); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); /* * If the first trailing whitespace is replaced with '\0' by strstrip, * turn it back into a space. */ if (orig_len > strlen(option)) option[strlen(option)] = ' '; return ret; } static void __init apply_trace_boot_options(void) { char *buf = trace_boot_options_buf; char *option; while (true) { option = strsep(&buf, ","); if (!option) break; if (*option) trace_set_options(&global_trace, option); /* Put back the comma to allow this to be called again */ if (buf) *(buf - 1) = ','; } } static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; char buf[64]; int ret; if (cnt >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; ret = trace_set_options(tr, buf); if (ret < 0) return ret; *ppos += cnt; return cnt; } static int tracing_trace_options_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = single_open(file, tracing_trace_options_show, inode->i_private); if (ret < 0) trace_array_put(tr); return ret; } static const struct file_operations tracing_iter_fops = { .open = tracing_trace_options_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_single_release_tr, .write = tracing_trace_options_write, }; static const char readme_msg[] = "tracing mini-HOWTO:\n\n" "By default tracefs removes all OTH file permission bits.\n" "When mounting tracefs an optional group id can be specified\n" "which adds the group to every directory and file in tracefs:\n\n" "\t e.g. mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n" "# echo 0 > tracing_on : quick way to disable tracing\n" "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" " Important files:\n" " trace\t\t\t- The static contents of the buffer\n" "\t\t\t To clear the buffer write into this file: echo > trace\n" " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" " current_tracer\t- function and latency tracers\n" " available_tracers\t- list of configured tracers for current_tracer\n" " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" " trace_clock\t\t- change the clock used to order events\n" " local: Per cpu clock but may not be synced across CPUs\n" " global: Synced across CPUs but slows tracing down.\n" " counter: Not a clock, but just an increment\n" " uptime: Jiffy counter from time of boot\n" " perf: Same clock that perf events use\n" #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif "\n timestamp_mode\t- view the mode used to timestamp events\n" " delta: Delta difference against a buffer-wide timestamp\n" " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" "\t\t\t Disable an option by prefixing 'no' to the\n" "\t\t\t option name\n" " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" "\t\t\t functions\n" "\t accepts: func_full_name or glob-matching-pattern\n" "\t modules: Can select a group via module\n" "\t Format: :mod:<module-name>\n" "\t example: echo :mod:ext3 > set_ftrace_filter\n" "\t triggers: a command to perform when function is hit\n" "\t Format: <function>:<trigger>[:count]\n" "\t trigger: traceon, traceoff\n" "\t\t enable_event:<system>:<event>\n" "\t\t disable_event:<system>:<event>\n" #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif "\t\t dump\n" "\t\t cpudump\n" "\t example: echo do_fault:traceoff > set_ftrace_filter\n" "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" "\t The first one will disable tracing every time do_fault is hit\n" "\t The second will disable tracing at most 3 times when do_trap is hit\n" "\t The first time do trap is hit and it disables tracing, the\n" "\t counter will decrement to 2. If tracing is already disabled,\n" "\t the counter will not decrement. It only decrements when the\n" "\t trigger did work\n" "\t To remove trigger without count:\n" "\t echo '!<function>:<trigger> > set_ftrace_filter\n" "\t To remove trigger with a count:\n" "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n" " set_ftrace_notrace\t- echo function name in here to never trace.\n" "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" "\t modules: Can select a group via module command :mod:\n" "\t Does not accept triggers\n" #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_TRACER " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" "\t\t (function)\n" " set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n" "\t\t (function)\n" #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n" " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\n snapshot\t\t- Like 'trace' but shows the content of the static\n" "\t\t\t snapshot buffer. Read the contents for more\n" "\t\t\t information\n" #endif #ifdef CONFIG_STACK_TRACER " stack_trace\t\t- Shows the max stack trace when active\n" " stack_max_size\t- Shows current max stack size that was traced\n" "\t\t\t Write into this file to reset the max size (trigger a\n" "\t\t\t new trace)\n" #ifdef CONFIG_DYNAMIC_FTRACE " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ #ifdef CONFIG_DYNAMIC_EVENTS " dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_KPROBE_EVENTS " kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_UPROBE_EVENTS " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \ defined(CONFIG_FPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t Format: p[:[<group>/][<event>]] <place> [<args>]\n" "\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n" #endif #ifdef CONFIG_FPROBE_EVENTS "\t f[:[<group>/][<event>]] <func-name>[%return] [<args>]\n" "\t t[:[<group>/][<event>]] <tracepoint> [<args>]\n" #endif #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/]<event> <field> [<field>]\n" #endif "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n" "\t -:[<group>/][<event>]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n" #endif #ifdef CONFIG_UPROBE_EVENTS " place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n" #endif "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS "\t <argname>[->field[->field|.field...]],\n" #endif #else "\t $stack<index>, $stack, $retval, $comm,\n" #endif "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" "\t kernel return probes support: $retval, $arg<N>, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" "\t symstr, %pd/%pD, <type>\\[<array-size>\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: <stype> <name>;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" "\t [unsigned] char/int/long\n" #endif "\t efield: For event probes ('e' types), the field is on of the fields\n" "\t of the <attached-group>/<attached-event>.\n" #endif " set_event\t\t- Enables events by name written into it\n" "\t\t\t Can enable module events via: :mod:<module>\n" " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" " events/<system>/\t- Directory containing all trace events for <system>:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" "\t\t\t events\n" " filter\t\t- If set, only events passing filter are traced\n" " events/<system>/<event>/\t- Directory containing control files for\n" "\t\t\t <event>:\n" " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n" " filter\t\t- If set, only events passing filter are traced\n" " trigger\t\t- If set, a command to perform when event is hit\n" "\t Format: <trigger>[:count][if <filter>]\n" "\t trigger: traceon, traceoff\n" "\t enable_event:<system>:<event>\n" "\t disable_event:<system>:<event>\n" #ifdef CONFIG_HIST_TRIGGERS "\t enable_hist:<system>:<event>\n" "\t disable_hist:<system>:<event>\n" #endif #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif #ifdef CONFIG_HIST_TRIGGERS "\t\t hist (see below)\n" #endif "\t example: echo traceoff > events/block/block_unplug/trigger\n" "\t echo traceoff:3 > events/block/block_unplug/trigger\n" "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" "\t events/block/block_unplug/trigger\n" "\t The first disables tracing every time block_unplug is hit.\n" "\t The second disables tracing the first 3 times block_unplug is hit.\n" "\t The third enables the kmalloc event the first 3 times block_unplug\n" "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n" "\t Like function triggers, the counter is only decremented if it\n" "\t enabled or disabled tracing.\n" "\t To remove a trigger without a count:\n" "\t echo '!<trigger> > <system>/<event>/trigger\n" "\t To remove a trigger with a count:\n" "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" "\t Filters can be ignored when removing a trigger.\n" #ifdef CONFIG_HIST_TRIGGERS " hist trigger\t- If set, event hits are aggregated into a hash table\n" "\t Format: hist:keys=<field1[,field2,...]>\n" "\t [:<var1>=<field|var_ref|numeric_literal>[,<var2>=...]]\n" "\t [:values=<field1[,field2,...]>]\n" "\t [:sort=<field1[,field2,...]>]\n" "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" "\t [:nohitcount]\n" "\t [:<handler>.<action>]\n" "\t [if <filter>]\n\n" "\t Note, special fields can be used as well:\n" "\t common_timestamp - to record current timestamp\n" "\t common_cpu - to record the CPU the event happened on\n" "\n" "\t A hist trigger variable can be:\n" "\t - a reference to a field e.g. x=current_timestamp,\n" "\t - a reference to another variable e.g. y=$x,\n" "\t - a numeric literal: e.g. ms_per_sec=1000,\n" "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n" "\n" "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n" "\t multiplication(*) and division(/) operators. An operand can be either a\n" "\t variable reference, field or numeric literal.\n" "\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. Keys\n" "\t can be any field, or the special string 'common_stacktrace'.\n" "\t Compound keys consisting of up to two fields can be specified\n" "\t by the 'keys' keyword. Values must correspond to numeric\n" "\t fields. Sort keys consisting of up to two fields can be\n" "\t specified using the 'sort' keyword. The sort direction can\n" "\t be modified by appending '.descending' or '.ascending' to a\n" "\t sort field. The 'size' parameter can be used to specify more\n" "\t or fewer than the default 2048 entries for the hashtable size.\n" "\t If a hist trigger is given a name using the 'name' parameter,\n" "\t its histogram data will be shared with other triggers of the\n" "\t same name, and trigger hits will update this common data.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout. If there are multiple hist\n" "\t triggers attached to an event, there will be a table for each\n" "\t trigger in the output. The table displayed for a named\n" "\t trigger will be the same as any other instance having the\n" "\t same name. The default format used to display a given field\n" "\t can be modified by appending any of the following modifiers\n" "\t to the field name, as applicable:\n\n" "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" "\t .execname display a common_pid as a program name\n" "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" "\t .buckets=size display values in groups of size rather than raw number\n" "\t .usecs display a common_timestamp in microseconds\n" "\t .percent display a number of percentage value\n" "\t .graph display a bar-graph of a value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" "\t restart a paused hist trigger.\n\n" "\t The 'clear' parameter will clear the contents of a running\n" "\t hist trigger and leave its current paused/active state\n" "\t unchanged.\n\n" "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n" "\t raw hitcount in the histogram.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" "\t already-attached hist trigger. The syntax is analogous to\n" "\t the enable_event and disable_event triggers.\n\n" "\t Hist trigger handlers and actions are executed whenever a\n" "\t a histogram entry is added or updated. They take the form:\n\n" "\t <handler>.<action>\n\n" "\t The available handlers are:\n\n" "\t onmatch(matching.event) - invoke on addition or update\n" "\t onmax(var) - invoke if var exceeds current max\n" "\t onchange(var) - invoke action if var changes\n\n" "\t The available actions are:\n\n" "\t trace(<synthetic_event>,param list) - generate synthetic event\n" "\t save(field,...) - save current event fields\n" #ifdef CONFIG_TRACER_SNAPSHOT "\t snapshot() - snapshot the trace buffer\n\n" #endif #ifdef CONFIG_SYNTH_EVENTS " events/synthetic_events\t- Create/append/remove/show synthetic events\n" "\t Write into this file to define/undefine new synthetic events.\n" "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n" #endif #endif ; static ssize_t tracing_readme_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { return simple_read_from_buffer(ubuf, cnt, ppos, readme_msg, strlen(readme_msg)); } static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, .llseek = generic_file_llseek, }; #ifdef CONFIG_TRACE_EVAL_MAP_FILE static union trace_eval_map_item * update_eval_map(union trace_eval_map_item *ptr) { if (!ptr->map.eval_string) { if (ptr->tail.next) { ptr = ptr->tail.next; /* Set ptr to the next real item (skip head) */ ptr++; } else return NULL; } return ptr; } static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) { union trace_eval_map_item *ptr = v; /* * Paranoid! If ptr points to end, we don't want to increment past it. * This really should never happen. */ (*pos)++; ptr = update_eval_map(ptr); if (WARN_ON_ONCE(!ptr)) return NULL; ptr++; ptr = update_eval_map(ptr); return ptr; } static void *eval_map_start(struct seq_file *m, loff_t *pos) { union trace_eval_map_item *v; loff_t l = 0; mutex_lock(&trace_eval_mutex); v = trace_eval_maps; if (v) v++; while (v && l < *pos) { v = eval_map_next(m, v, &l); } return v; } static void eval_map_stop(struct seq_file *m, void *v) { mutex_unlock(&trace_eval_mutex); } static int eval_map_show(struct seq_file *m, void *v) { union trace_eval_map_item *ptr = v; seq_printf(m, "%s %ld (%s)\n", ptr->map.eval_string, ptr->map.eval_value, ptr->map.system); return 0; } static const struct seq_operations tracing_eval_map_seq_ops = { .start = eval_map_start, .next = eval_map_next, .stop = eval_map_stop, .show = eval_map_show, }; static int tracing_eval_map_open(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; return seq_open(filp, &tracing_eval_map_seq_ops); } static const struct file_operations tracing_eval_map_fops = { .open = tracing_eval_map_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static inline union trace_eval_map_item * trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) { /* Return tail of array given the head */ return ptr + ptr->head.length + 1; } static void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **stop; struct trace_eval_map **map; union trace_eval_map_item *map_array; union trace_eval_map_item *ptr; stop = start + len; /* * The trace_eval_maps contains the map plus a head and tail item, * where the head holds the module and length of array, and the * tail holds a pointer to the next list. */ map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL); if (!map_array) { pr_warn("Unable to allocate trace eval mapping\n"); return; } guard(mutex)(&trace_eval_mutex); if (!trace_eval_maps) trace_eval_maps = map_array; else { ptr = trace_eval_maps; for (;;) { ptr = trace_eval_jmp_to_tail(ptr); if (!ptr->tail.next) break; ptr = ptr->tail.next; } ptr->tail.next = map_array; } map_array->head.mod = mod; map_array->head.length = len; map_array++; for (map = start; (unsigned long)map < (unsigned long)stop; map++) { map_array->map = **map; map_array++; } memset(map_array, 0, sizeof(*map_array)); } static void trace_create_eval_file(struct dentry *d_tracer) { trace_create_file("eval_map", TRACE_MODE_READ, d_tracer, NULL, &tracing_eval_map_fops); } #else /* CONFIG_TRACE_EVAL_MAP_FILE */ static inline void trace_create_eval_file(struct dentry *d_tracer) { } static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ static void trace_insert_eval_map(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **map; if (len <= 0) return; map = start; trace_event_eval_update(map, len); trace_insert_eval_map_file(mod, start, len); } static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); r = sprintf(buf, "%s\n", tr->current_trace->name); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } int tracer_init(struct tracer *t, struct trace_array *tr) { tracing_reset_online_cpus(&tr->array_buffer); return t->init(tr); } static void set_buffer_entries(struct array_buffer *buf, unsigned long val) { int cpu; for_each_tracing_cpu(cpu) per_cpu_ptr(buf->data, cpu)->entries = val; } static void update_buffer_entries(struct array_buffer *buf, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); } else { per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu); } } #ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, struct array_buffer *size_buf, int cpu_id) { int cpu, ret = 0; if (cpu_id == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { ret = ring_buffer_resize(trace_buf->buffer, per_cpu_ptr(size_buf->data, cpu)->entries, cpu); if (ret < 0) break; per_cpu_ptr(trace_buf->data, cpu)->entries = per_cpu_ptr(size_buf->data, cpu)->entries; } } else { ret = ring_buffer_resize(trace_buf->buffer, per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); if (ret == 0) per_cpu_ptr(trace_buf->data, cpu_id)->entries = per_cpu_ptr(size_buf->data, cpu_id)->entries; } return ret; } #endif /* CONFIG_TRACER_MAX_TRACE */ static int __tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu) { int ret; /* * If kernel or user changes the size of the ring buffer * we use the size that was given, and we can forget about * expanding it later. */ trace_set_ring_buffer_expanded(tr); /* May be called before buffers are initialized */ if (!tr->array_buffer.buffer) return 0; /* Do not allow tracing while resizing ring buffer */ tracing_stop_tr(tr); ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu); if (ret < 0) goto out_start; #ifdef CONFIG_TRACER_MAX_TRACE if (!tr->allocated_snapshot) goto out; ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); if (ret < 0) { int r = resize_buffer_duplicate_size(&tr->array_buffer, &tr->array_buffer, cpu); if (r < 0) { /* * AARGH! We are left with different * size max buffer!!!! * The max buffer is our "snapshot" buffer. * When a tracer needs a snapshot (one of the * latency tracers), it swaps the max buffer * with the saved snap shot. We succeeded to * update the size of the main buffer, but failed to * update the size of the max buffer. But when we tried * to reset the main buffer to the original size, we * failed there too. This is very unlikely to * happen, but if it does, warn and kill all * tracing. */ WARN_ON(1); tracing_disabled = 1; } goto out_start; } update_buffer_entries(&tr->max_buffer, cpu); out: #endif /* CONFIG_TRACER_MAX_TRACE */ update_buffer_entries(&tr->array_buffer, cpu); out_start: tracing_start_tr(tr); return ret; } ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { int ret; guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { /* make sure, this cpu is enabled in the mask */ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) return -EINVAL; } ret = __tracing_resize_ring_buffer(tr, size, cpu_id); if (ret < 0) ret = -ENOMEM; return ret; } static void update_last_data(struct trace_array *tr) { if (!tr->text_delta && !tr->data_delta) return; /* * Need to clear all CPU buffers as there cannot be events * from the previous boot mixed with events with this boot * as that will cause a confusing trace. Need to clear all * CPU buffers, even for those that may currently be offline. */ tracing_reset_all_cpus(&tr->array_buffer); /* Using current data now */ tr->text_delta = 0; tr->data_delta = 0; } /** * tracing_update_buffers - used by tracing facility to expand ring buffers * @tr: The tracing instance * * To save on memory when the tracing is never used on a system with it * configured in. The ring buffers are set to a minimum size. But once * a user starts to use the tracing facility, then they need to grow * to their default size. * * This function is to be called when a tracer is about to be used. */ int tracing_update_buffers(struct trace_array *tr) { int ret = 0; mutex_lock(&trace_types_lock); update_last_data(tr); if (!tr->ring_buffer_expanded) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); return ret; } struct trace_option_dentry; static void create_trace_option_files(struct trace_array *tr, struct tracer *tracer); /* * Used to clear out the tracer before deletion of an instance. * Must have trace_types_lock held. */ static void tracing_set_nop(struct trace_array *tr) { if (tr->current_trace == &nop_trace) return; tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); tr->current_trace = &nop_trace; } static bool tracer_options_updated; static void add_tracer_options(struct trace_array *tr, struct tracer *t) { /* Only enable if the directory has been created already. */ if (!tr->dir) return; /* Only create trace option files after update_tracer_options finish */ if (!tracer_options_updated) return; create_trace_option_files(tr, t); } int tracing_set_tracer(struct trace_array *tr, const char *buf) { struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; #endif int ret; guard(mutex)(&trace_types_lock); update_last_data(tr); if (!tr->ring_buffer_expanded) { ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) return ret; ret = 0; } for (t = trace_types; t; t = t->next) { if (strcmp(t->name, buf) == 0) break; } if (!t) return -EINVAL; if (t == tr->current_trace) return 0; #ifdef CONFIG_TRACER_SNAPSHOT if (t->use_max_tr) { local_irq_disable(); arch_spin_lock(&tr->max_lock); ret = tr->cond_snapshot ? -EBUSY : 0; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) return ret; } #endif /* Some tracers won't work on kernel command line */ if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", t->name); return -EINVAL; } /* Some tracers are only allowed for the top level buffer */ if (!trace_ok_for_array(t, tr)) return -EINVAL; /* If trace pipe files are being read, we can't change the tracer */ if (tr->trace_ref) return -EBUSY; trace_branch_disable(); tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); #ifdef CONFIG_TRACER_MAX_TRACE had_max_tr = tr->current_trace->use_max_tr; /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { /* * We need to make sure that the update_max_tr sees that * current_trace changed to nop_trace to keep it from * swapping the buffers after we resize it. * The update_max_tr is called from interrupts disabled * so a synchronized_sched() is sufficient. */ synchronize_rcu(); free_snapshot(tr); tracing_disarm_snapshot(tr); } if (!had_max_tr && t->use_max_tr) { ret = tracing_arm_snapshot_locked(tr); if (ret) return ret; } #else tr->current_trace = &nop_trace; #endif if (t->init) { ret = tracer_init(t, tr); if (ret) { #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr) tracing_disarm_snapshot(tr); #endif return ret; } } tr->current_trace = t; tr->current_trace->enabled++; trace_branch_enable(tr); return 0; } static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; char *name; size_t ret; int err; ret = cnt; if (cnt > MAX_TRACER_SIZE) cnt = MAX_TRACER_SIZE; if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; name = strim(buf); err = tracing_set_tracer(tr, name); if (err) return err; *ppos += ret; return ret; } static ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; int r; r = snprintf(buf, sizeof(buf), "%ld\n", *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr)); if (r > sizeof(buf)) r = sizeof(buf); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; *ptr = val * 1000; return cnt; } static ssize_t tracing_thresh_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); } static ssize_t tracing_thresh_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; int ret; guard(mutex)(&trace_types_lock); ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); if (ret < 0) return ret; if (tr->current_trace->update_thresh) { ret = tr->current_trace->update_thresh(tr); if (ret < 0) return ret; } return cnt; } #ifdef CONFIG_TRACER_MAX_TRACE static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); } static ssize_t tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); } #endif static int open_pipe_on_cpu(struct trace_array *tr, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { if (cpumask_empty(tr->pipe_cpumask)) { cpumask_setall(tr->pipe_cpumask); return 0; } } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { cpumask_set_cpu(cpu, tr->pipe_cpumask); return 0; } return -EBUSY; } static void close_pipe_on_cpu(struct trace_array *tr, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { WARN_ON(!cpumask_full(tr->pipe_cpumask)); cpumask_clear(tr->pipe_cpumask); } else { WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); cpumask_clear_cpu(cpu, tr->pipe_cpumask); } } static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int cpu; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; mutex_lock(&trace_types_lock); cpu = tracing_get_cpu(inode); ret = open_pipe_on_cpu(tr, cpu); if (ret) goto fail_pipe_on_cpu; /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; goto fail_alloc_iter; } trace_seq_init(&iter->seq); iter->trace = tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; goto fail; } /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; iter->tr = tr; iter->array_buffer = &tr->array_buffer; iter->cpu_file = cpu; mutex_init(&iter->mutex); filp->private_data = iter; if (iter->trace->pipe_open) iter->trace->pipe_open(iter); nonseekable_open(inode, filp); tr->trace_ref++; mutex_unlock(&trace_types_lock); return ret; fail: kfree(iter); fail_alloc_iter: close_pipe_on_cpu(tr, cpu); fail_pipe_on_cpu: __trace_array_put(tr); mutex_unlock(&trace_types_lock); return ret; } static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; struct trace_array *tr = inode->i_private; mutex_lock(&trace_types_lock); tr->trace_ref--; if (iter->trace->pipe_close) iter->trace->pipe_close(iter); close_pipe_on_cpu(tr, iter->cpu_file); mutex_unlock(&trace_types_lock); free_trace_iter_content(iter); kfree(iter); trace_array_put(tr); return 0; } static __poll_t trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { struct trace_array *tr = iter->tr; /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return EPOLLIN | EPOLLRDNORM; if (tr->trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ return EPOLLIN | EPOLLRDNORM; else return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file, filp, poll_table, iter->tr->buffer_percent); } static __poll_t tracing_poll_pipe(struct file *filp, poll_table *poll_table) { struct trace_iterator *iter = filp->private_data; return trace_poll(iter, filp, poll_table); } /* Must be called with iter->mutex held. */ static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; int ret; while (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) { return -EAGAIN; } /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never * read anything. This allows a user to cat this file, and * then enable tracing. But after we have read something, * we give an EOF when tracing is again disabled. * * iter->pos will be 0 if we haven't read anything. */ if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); ret = wait_on_pipe(iter, 0); mutex_lock(&iter->mutex); if (ret) return ret; } return 1; } /* * Consumer reader. */ static ssize_t tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; ssize_t sret; /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself * is protected. */ guard(mutex)(&iter->mutex); /* return any leftover data */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) return sret; trace_seq_init(&iter->seq); if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) return sret; } waitagain: sret = tracing_wait_pipe(filp); if (sret <= 0) return sret; /* stop when tracing is finished */ if (trace_empty(iter)) return 0; if (cnt >= TRACE_SEQ_BUFFER_SIZE) cnt = TRACE_SEQ_BUFFER_SIZE - 1; /* reset all but tr, trace, and overruns */ trace_iterator_reset(iter); cpumask_clear(iter->started); trace_seq_init(&iter->seq); trace_event_read_lock(); trace_access_lock(iter->cpu_file); while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int save_len = iter->seq.seq.len; ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { /* * If one print_trace_line() fills entire trace_seq in one shot, * trace_seq_to_user() will returns -EBUSY because save_len == 0, * In this case, we need to consume it, otherwise, loop will peek * this event next time, resulting in an infinite loop. */ if (save_len == 0) { iter->seq.full = 0; trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); trace_consume(iter); break; } /* In other cases, don't print partial lines */ iter->seq.seq.len = save_len; break; } if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); if (trace_seq_used(&iter->seq) >= cnt) break; /* * Setting the full flag means we reached the trace_seq buffer * size and we should leave by partial output condition above. * One of the trace_seq_* functions is not used properly. */ WARN_ONCE(iter->seq.full, "full flag set for trace type %d", iter->ent->type); } trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (iter->seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* * If there was nothing to send to user, in spite of consuming trace * entries, go back to wait for more entries. */ if (sret == -EBUSY) goto waitagain; return sret; } static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, unsigned int idx) { __free_page(spd->pages[idx]); } static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; int save_len; int ret; /* Seq buffer is page-sized, exactly what we need. */ for (;;) { save_len = iter->seq.seq.len; ret = print_trace_line(iter); if (trace_seq_has_overflowed(&iter->seq)) { iter->seq.seq.len = save_len; break; } /* * This should not be hit, because it should only * be set if the iter->seq overflowed. But check it * anyway to be safe. */ if (ret == TRACE_TYPE_PARTIAL_LINE) { iter->seq.seq.len = save_len; break; } count = trace_seq_used(&iter->seq) - save_len; if (rem < count) { rem = 0; iter->seq.seq.len = save_len; break; } if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); rem -= count; if (!trace_find_next_entry_inc(iter)) { rem = 0; iter->ent = NULL; break; } } return rem; } static ssize_t tracing_splice_read_pipe(struct file *filp, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct page *pages_def[PIPE_DEF_BUFFERS]; struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .nr_pages_max = PIPE_DEF_BUFFERS, .ops = &default_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; ssize_t ret; size_t rem; unsigned int i; if (splice_grow_spd(pipe, &spd)) return -ENOMEM; mutex_lock(&iter->mutex); if (iter->trace->splice_read) { ret = iter->trace->splice_read(iter, filp, ppos, pipe, len, flags); if (ret) goto out_err; } ret = tracing_wait_pipe(filp); if (ret <= 0) goto out_err; if (!iter->ent && !trace_find_next_entry_inc(iter)) { ret = -EFAULT; goto out_err; } trace_event_read_lock(); trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { spd.pages[i] = alloc_page(GFP_KERNEL); if (!spd.pages[i]) break; rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), trace_seq_used(&iter->seq)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; spd.partial[i].len = trace_seq_used(&iter->seq); trace_seq_init(&iter->seq); } trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); mutex_unlock(&iter->mutex); spd.nr_pages = i; if (i) ret = splice_to_pipe(pipe, &spd); else ret = 0; out: splice_shrink_spd(&spd); return ret; out_err: mutex_unlock(&iter->mutex); goto out; } static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct inode *inode = file_inode(filp); struct trace_array *tr = inode->i_private; int cpu = tracing_get_cpu(inode); char buf[64]; int r = 0; ssize_t ret; mutex_lock(&trace_types_lock); if (cpu == RING_BUFFER_ALL_CPUS) { int cpu, buf_size_same; unsigned long size; size = 0; buf_size_same = 1; /* check if all cpu sizes are same */ for_each_tracing_cpu(cpu) { /* fill in the size from first enabled cpu */ if (size == 0) size = per_cpu_ptr(tr->array_buffer.data, cpu)->entries; if (size != per_cpu_ptr(tr->array_buffer.data, cpu)->entries) { buf_size_same = 0; break; } } if (buf_size_same) { if (!tr->ring_buffer_expanded) r = sprintf(buf, "%lu (expanded: %lu)\n", size >> 10, trace_buf_size >> 10); else r = sprintf(buf, "%lu\n", size >> 10); } else r = sprintf(buf, "X\n"); } else r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10); mutex_unlock(&trace_types_lock); ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); return ret; } static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct inode *inode = file_inode(filp); struct trace_array *tr = inode->i_private; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; /* must have at least 1 entry */ if (!val) return -EINVAL; /* value is in KB */ val <<= 10; ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); if (ret < 0) return ret; *ppos += cnt; return cnt; } static ssize_t tracing_total_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[64]; int r, cpu; unsigned long size = 0, expanded_size = 0; mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10; if (!tr->ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } if (tr->ring_buffer_expanded) r = sprintf(buf, "%lu\n", size); else r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; struct seq_buf seq; char buf[64]; seq_buf_init(&seq, buf, 64); seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta); seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta); return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq)); } static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; int cpu = tracing_get_cpu(inode); int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu); if (ret < 0) __trace_array_put(tr); return ret; } static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { /* * There is no need to read what the user has written, this function * is just to make sure that there is no error when "echo" is used */ *ppos += cnt; return cnt; } static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; /* disable tracing ? */ if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); trace_array_put(tr); return 0; } #define TRACE_MARKER_MAX_SIZE 4096 static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; struct print_entry *entry; int meta_size; ssize_t written; size_t size; int len; /* Used in tracing_mark_raw_write() as well */ #define FAULTED_STR "<faulted>" #define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ if (tracing_disabled) return -EINVAL; if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; if ((ssize_t)cnt < 0) return -EINVAL; if (cnt > TRACE_MARKER_MAX_SIZE) cnt = TRACE_MARKER_MAX_SIZE; meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ again: size = cnt + meta_size; /* If less than "<faulted>", then make sure we can still add that */ if (cnt < FAULTED_SIZE) size += FAULTED_SIZE - cnt; buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); if (unlikely(!event)) { /* * If the size was greater than what was allowed, then * make it smaller and try again. */ if (size > ring_buffer_max_event_size(buffer)) { /* cnt < FAULTED size should never be bigger than max */ if (WARN_ON_ONCE(cnt < FAULTED_SIZE)) return -EBADF; cnt = ring_buffer_max_event_size(buffer) - meta_size; /* The above should only happen once */ if (WARN_ON_ONCE(cnt + meta_size == size)) return -EBADF; goto again; } /* Ring buffer disabled, return as if not open for write */ return -EBADF; } entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); if (len) { memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; written = -EFAULT; } else written = cnt; if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ entry->buf[cnt] = '\0'; tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event); } if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; entry->buf[cnt + 1] = '\0'; } else entry->buf[cnt] = '\0'; if (static_branch_unlikely(&trace_marker_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_MARKER); __buffer_unlock_commit(buffer, event); if (tt) event_triggers_post_call(tr->trace_marker_file, tt); return written; } static ssize_t tracing_mark_raw_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; struct trace_buffer *buffer; struct raw_data_entry *entry; ssize_t written; int size; int len; #define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) if (tracing_disabled) return -EINVAL; if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; /* The marker must at least have a tag id */ if (cnt < sizeof(unsigned int)) return -EINVAL; size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; buffer = tr->array_buffer.buffer; if (size > ring_buffer_max_event_size(buffer)) return -EINVAL; event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, tracing_gen_ctx()); if (!event) /* Ring buffer disabled, return as if not open for write */ return -EBADF; entry = ring_buffer_event_data(event); len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); if (len) { entry->id = -1; memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); written = -EFAULT; } else written = cnt; __buffer_unlock_commit(buffer, event); return written; } static int tracing_clock_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) seq_printf(m, "%s%s%s%s", i ? " " : "", i == tr->clock_id ? "[" : "", trace_clocks[i].name, i == tr->clock_id ? "]" : ""); seq_putc(m, '\n'); return 0; } int tracing_set_clock(struct trace_array *tr, const char *clockstr) { int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { if (strcmp(trace_clocks[i].name, clockstr) == 0) break; } if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; mutex_lock(&trace_types_lock); tr->clock_id = i; ring_buffer_set_clock(tr->array_buffer.buffer, trace_clocks[i].func); /* * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. */ tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); tracing_reset_online_cpus(&tr->max_buffer); #endif mutex_unlock(&trace_types_lock); return 0; } static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; char buf[64]; const char *clockstr; int ret; if (cnt >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; clockstr = strstrip(buf); ret = tracing_set_clock(tr, clockstr); if (ret) return ret; *fpos += cnt; return cnt; } static int tracing_clock_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = single_open(file, tracing_clock_show, inode->i_private); if (ret < 0) trace_array_put(tr); return ret; } static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; mutex_lock(&trace_types_lock); if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) seq_puts(m, "delta [absolute]\n"); else seq_puts(m, "[delta] absolute\n"); mutex_unlock(&trace_types_lock); return 0; } static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); if (ret < 0) trace_array_put(tr); return ret; } u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe) { if (rbe == this_cpu_read(trace_buffered_event)) return ring_buffer_time_stamp(buffer); return ring_buffer_event_time_stamp(buffer, rbe); } /* * Set or disable using the per CPU trace_buffer_event when possible. */ int tracing_set_filter_buffering(struct trace_array *tr, bool set) { guard(mutex)(&trace_types_lock); if (set && tr->no_filter_buffering_ref++) return 0; if (!set) { if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) return -EINVAL; --tr->no_filter_buffering_ref; } return 0; } struct ftrace_buffer_info { struct trace_iterator iter; void *spare; unsigned int spare_cpu; unsigned int spare_size; unsigned int read; }; #ifdef CONFIG_TRACER_SNAPSHOT static int tracing_snapshot_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; struct seq_file *m; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); } else { /* Writes still need the seq_file to hold the private data */ ret = -ENOMEM; m = kzalloc(sizeof(*m), GFP_KERNEL); if (!m) goto out; iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { kfree(m); goto out; } ret = 0; iter->tr = tr; iter->array_buffer = &tr->max_buffer; iter->cpu_file = tracing_get_cpu(inode); m->private = iter; file->private_data = m; } out: if (ret < 0) trace_array_put(tr); return ret; } static void tracing_swap_cpu_buffer(void *tr) { update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); } static ssize_t tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct seq_file *m = filp->private_data; struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; unsigned long val; int ret; ret = tracing_update_buffers(tr); if (ret < 0) return ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; guard(mutex)(&trace_types_lock); if (tr->current_trace->use_max_tr) return -EBUSY; local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) return ret; switch (val) { case 0: if (iter->cpu_file != RING_BUFFER_ALL_CPUS) return -EINVAL; if (tr->allocated_snapshot) free_snapshot(tr); break; case 1: /* Only allow per-cpu swap if the ring buffer supports it */ #ifndef CONFIG_RING_BUFFER_ALLOW_SWAP if (iter->cpu_file != RING_BUFFER_ALL_CPUS) return -EINVAL; #endif if (tr->allocated_snapshot) ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, iter->cpu_file); ret = tracing_arm_snapshot_locked(tr); if (ret) return ret; /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { local_irq_disable(); update_max_tr(tr, current, smp_processor_id(), NULL); local_irq_enable(); } else { smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, (void *)tr, 1); } tracing_disarm_snapshot(tr); break; default: if (tr->allocated_snapshot) { if (iter->cpu_file == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->max_buffer); else tracing_reset_cpu(&tr->max_buffer, iter->cpu_file); } break; } if (ret >= 0) { *ppos += cnt; ret = cnt; } return ret; } static int tracing_snapshot_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; int ret; ret = tracing_release(inode, file); if (file->f_mode & FMODE_READ) return ret; /* If write only, the seq_file is just a stub */ if (m) kfree(m->private); kfree(m); return 0; } static int tracing_buffers_open(struct inode *inode, struct file *filp); static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos); static int tracing_buffers_release(struct inode *inode, struct file *file); static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); static int snapshot_raw_open(struct inode *inode, struct file *filp) { struct ftrace_buffer_info *info; int ret; /* The following checks for tracefs lockdown */ ret = tracing_buffers_open(inode, filp); if (ret < 0) return ret; info = filp->private_data; if (info->iter.trace->use_max_tr) { tracing_buffers_release(inode, filp); return -EBUSY; } info->iter.snapshot = true; info->iter.array_buffer = &info->iter.tr->max_buffer; return ret; } #endif /* CONFIG_TRACER_SNAPSHOT */ static const struct file_operations tracing_thresh_fops = { .open = tracing_open_generic, .read = tracing_thresh_read, .write = tracing_thresh_write, .llseek = generic_file_llseek, }; #ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic_tr, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; #endif static const struct file_operations set_tracer_fops = { .open = tracing_open_generic_tr, .read = tracing_set_trace_read, .write = tracing_set_trace_write, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_pipe_fops = { .open = tracing_open_pipe, .poll = tracing_poll_pipe, .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, }; static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic_tr, .read = tracing_entries_read, .write = tracing_entries_write, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_buffer_meta_fops = { .open = tracing_buffer_meta_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_seq_release, }; static const struct file_operations tracing_total_entries_fops = { .open = tracing_open_generic_tr, .read = tracing_total_entries_read, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_free_buffer_fops = { .open = tracing_open_generic_tr, .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, }; static const struct file_operations tracing_mark_fops = { .open = tracing_mark_open, .write = tracing_mark_write, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_mark_raw_fops = { .open = tracing_mark_open, .write = tracing_mark_raw_write, .release = tracing_release_generic_tr, }; static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_single_release_tr, .write = tracing_clock_write, }; static const struct file_operations trace_time_stamp_mode_fops = { .open = tracing_time_stamp_mode_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_single_release_tr, }; static const struct file_operations last_boot_fops = { .open = tracing_open_generic_tr, .read = tracing_last_boot_read, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; #ifdef CONFIG_TRACER_SNAPSHOT static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, .read = seq_read, .write = tracing_snapshot_write, .llseek = tracing_lseek, .release = tracing_snapshot_release, }; static const struct file_operations snapshot_raw_fops = { .open = snapshot_raw_open, .read = tracing_buffers_read, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, }; #endif /* CONFIG_TRACER_SNAPSHOT */ /* * trace_min_max_write - Write a u64 value to a trace_min_max_param struct * @filp: The active open file structure * @ubuf: The userspace provided buffer to read value into * @cnt: The maximum number of bytes to read * @ppos: The current "file" position * * This function implements the write interface for a struct trace_min_max_param. * The filp->private_data must point to a trace_min_max_param structure that * defines where to write the value, the min and the max acceptable values, * and a lock to protect the write. */ static ssize_t trace_min_max_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_min_max_param *param = filp->private_data; u64 val; int err; if (!param) return -EFAULT; err = kstrtoull_from_user(ubuf, cnt, 10, &val); if (err) return err; if (param->lock) mutex_lock(param->lock); if (param->min && val < *param->min) err = -EINVAL; if (param->max && val > *param->max) err = -EINVAL; if (!err) *param->val = val; if (param->lock) mutex_unlock(param->lock); if (err) return err; return cnt; } /* * trace_min_max_read - Read a u64 value from a trace_min_max_param struct * @filp: The active open file structure * @ubuf: The userspace provided buffer to read value into * @cnt: The maximum number of bytes to read * @ppos: The current "file" position * * This function implements the read interface for a struct trace_min_max_param. * The filp->private_data must point to a trace_min_max_param struct with valid * data. */ static ssize_t trace_min_max_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_min_max_param *param = filp->private_data; char buf[U64_STR_SIZE]; int len; u64 val; if (!param) return -EFAULT; val = *param->val; if (cnt > sizeof(buf)) cnt = sizeof(buf); len = snprintf(buf, sizeof(buf), "%llu\n", val); return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } const struct file_operations trace_min_max_fops = { .open = tracing_open_generic, .read = trace_min_max_read, .write = trace_min_max_write, }; #define TRACING_LOG_ERRS_MAX 8 #define TRACING_LOG_LOC_MAX 128 #define CMD_PREFIX " Command: " struct err_info { const char **errs; /* ptr to loc-specific array of err strings */ u8 type; /* index into errs -> specific err string */ u16 pos; /* caret position */ u64 ts; }; struct tracing_log_err { struct list_head list; struct err_info info; char loc[TRACING_LOG_LOC_MAX]; /* err location */ char *cmd; /* what caused err */ }; static DEFINE_MUTEX(tracing_err_log_lock); static struct tracing_log_err *alloc_tracing_log_err(int len) { struct tracing_log_err *err; err = kzalloc(sizeof(*err), GFP_KERNEL); if (!err) return ERR_PTR(-ENOMEM); err->cmd = kzalloc(len, GFP_KERNEL); if (!err->cmd) { kfree(err); return ERR_PTR(-ENOMEM); } return err; } static void free_tracing_log_err(struct tracing_log_err *err) { kfree(err->cmd); kfree(err); } static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, int len) { struct tracing_log_err *err; char *cmd; if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { err = alloc_tracing_log_err(len); if (PTR_ERR(err) != -ENOMEM) tr->n_err_log_entries++; return err; } cmd = kzalloc(len, GFP_KERNEL); if (!cmd) return ERR_PTR(-ENOMEM); err = list_first_entry(&tr->err_log, struct tracing_log_err, list); kfree(err->cmd); err->cmd = cmd; list_del(&err->list); return err; } /** * err_pos - find the position of a string within a command for error careting * @cmd: The tracing command that caused the error * @str: The string to position the caret at within @cmd * * Finds the position of the first occurrence of @str within @cmd. The * return value can be passed to tracing_log_err() for caret placement * within @cmd. * * Returns the index within @cmd of the first occurrence of @str or 0 * if @str was not found. */ unsigned int err_pos(char *cmd, const char *str) { char *found; if (WARN_ON(!strlen(cmd))) return 0; found = strstr(cmd, str); if (found) return found - cmd; return 0; } /** * tracing_log_err - write an error to the tracing error log * @tr: The associated trace array for the error (NULL for top level array) * @loc: A string describing where the error occurred * @cmd: The tracing command that caused the error * @errs: The array of loc-specific static error strings * @type: The index into errs[], which produces the specific static err string * @pos: The position the caret should be placed in the cmd * * Writes an error into tracing/error_log of the form: * * <loc>: error: <text> * Command: <cmd> * ^ * * tracing/error_log is a small log file containing the last * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated * unless there has been a tracing error, and the error log can be * cleared and have its memory freed by writing the empty string in * truncation mode to it i.e. echo > tracing/error_log. * * NOTE: the @errs array along with the @type param are used to * produce a static error string - this string is not copied and saved * when the error is logged - only a pointer to it is saved. See * existing callers for examples of how static strings are typically * defined for use with tracing_log_err(). */ void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, const char **errs, u8 type, u16 pos) { struct tracing_log_err *err; int len = 0; if (!tr) tr = &global_trace; len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; guard(mutex)(&tracing_err_log_lock); err = get_tracing_log_err(tr, len); if (PTR_ERR(err) == -ENOMEM) return; snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); err->info.errs = errs; err->info.type = type; err->info.pos = pos; err->info.ts = local_clock(); list_add_tail(&err->list, &tr->err_log); } static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; mutex_lock(&tracing_err_log_lock); list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); free_tracing_log_err(err); } tr->n_err_log_entries = 0; mutex_unlock(&tracing_err_log_lock); } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) { struct trace_array *tr = m->private; mutex_lock(&tracing_err_log_lock); return seq_list_start(&tr->err_log, *pos); } static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; return seq_list_next(v, &tr->err_log, pos); } static void tracing_err_log_seq_stop(struct seq_file *m, void *v) { mutex_unlock(&tracing_err_log_lock); } static void tracing_err_log_show_pos(struct seq_file *m, u16 pos) { u16 i; for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) seq_putc(m, ' '); for (i = 0; i < pos; i++) seq_putc(m, ' '); seq_puts(m, "^\n"); } static int tracing_err_log_seq_show(struct seq_file *m, void *v) { struct tracing_log_err *err = v; if (err) { const char *err_text = err->info.errs[err->info.type]; u64 sec = err->info.ts; u32 nsec; nsec = do_div(sec, NSEC_PER_SEC); seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, err->loc, err_text); seq_printf(m, "%s", err->cmd); tracing_err_log_show_pos(m, err->info.pos); } return 0; } static const struct seq_operations tracing_err_log_seq_ops = { .start = tracing_err_log_seq_start, .next = tracing_err_log_seq_next, .stop = tracing_err_log_seq_stop, .show = tracing_err_log_seq_show }; static int tracing_err_log_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret = 0; ret = tracing_check_open_get_tr(tr); if (ret) return ret; /* If this file was opened for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) clear_tracing_err_log(tr); if (file->f_mode & FMODE_READ) { ret = seq_open(file, &tracing_err_log_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = tr; } else { trace_array_put(tr); } } return ret; } static ssize_t tracing_err_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { return count; } static int tracing_err_log_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); if (file->f_mode & FMODE_READ) seq_release(inode, file); return 0; } static const struct file_operations tracing_err_log_fops = { .open = tracing_err_log_open, .write = tracing_err_log_write, .read = seq_read, .llseek = tracing_lseek, .release = tracing_err_log_release, }; static int tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct ftrace_buffer_info *info; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; info = kvzalloc(sizeof(*info), GFP_KERNEL); if (!info) { trace_array_put(tr); return -ENOMEM; } mutex_lock(&trace_types_lock); info->iter.tr = tr; info->iter.cpu_file = tracing_get_cpu(inode); info->iter.trace = tr->current_trace; info->iter.array_buffer = &tr->array_buffer; info->spare = NULL; /* Force reading ring buffer for first read */ info->read = (unsigned int)-1; filp->private_data = info; tr->trace_ref++; mutex_unlock(&trace_types_lock); ret = nonseekable_open(inode, filp); if (ret < 0) trace_array_put(tr); return ret; } static __poll_t tracing_buffers_poll(struct file *filp, poll_table *poll_table) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; return trace_poll(iter, filp, poll_table); } static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; void *trace_data; int page_size; ssize_t ret = 0; ssize_t size; if (!count) return 0; #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->tr->current_trace->use_max_tr) return -EBUSY; #endif page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); /* Make sure the spare matches the current sub buffer size */ if (info->spare) { if (page_size != info->spare_size) { ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); info->spare = NULL; } } if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); if (IS_ERR(info->spare)) { ret = PTR_ERR(info->spare); info->spare = NULL; } else { info->spare_cpu = iter->cpu_file; info->spare_size = page_size; } } if (!info->spare) return ret; /* Do we have previous read data to read? */ if (info->read < page_size) goto read; again: trace_access_lock(iter->cpu_file); ret = ring_buffer_read_page(iter->array_buffer->buffer, info->spare, count, iter->cpu_file, 0); trace_access_unlock(iter->cpu_file); if (ret < 0) { if (trace_empty(iter) && !iter->closed) { if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; ret = wait_on_pipe(iter, 0); if (ret) return ret; goto again; } return 0; } info->read = 0; read: size = page_size - info->read; if (size > count) size = count; trace_data = ring_buffer_read_page_data(info->spare); ret = copy_to_user(ubuf, trace_data + info->read, size); if (ret == size) return -EFAULT; size -= ret; *ppos += size; info->read += size; return size; } static int tracing_buffers_flush(struct file *file, fl_owner_t id) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; iter->closed = true; /* Make sure the waiters see the new wait_index */ (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); return 0; } static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; mutex_lock(&trace_types_lock); iter->tr->trace_ref--; __trace_array_put(iter->tr); if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); kvfree(info); mutex_unlock(&trace_types_lock); return 0; } struct buffer_ref { struct trace_buffer *buffer; void *page; int cpu; refcount_t refcount; }; static void buffer_ref_release(struct buffer_ref *ref) { if (!refcount_dec_and_test(&ref->refcount)) return; ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); } static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; buffer_ref_release(ref); buf->private = 0; } static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; if (refcount_read(&ref->refcount) > INT_MAX/2) return false; refcount_inc(&ref->refcount); return true; } /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .release = buffer_pipe_buf_release, .get = buffer_pipe_buf_get, }; /* * Callback from splice_to_pipe(), if we need to release some pages * at the end of the spd in case we error'ed out in filling the pipe. */ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) { struct buffer_ref *ref = (struct buffer_ref *)spd->partial[i].private; buffer_ref_release(ref); spd->partial[i].private = 0; } static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, .nr_pages_max = PIPE_DEF_BUFFERS, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; bool woken = false; int page_size; int entries, i; ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->tr->current_trace->use_max_tr) return -EBUSY; #endif page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); if (*ppos & (page_size - 1)) return -EINVAL; if (len & (page_size - 1)) { if (len < page_size) return -EINVAL; len &= (~(page_size - 1)); } if (splice_grow_spd(pipe, &spd)) return -ENOMEM; again: trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) { struct page *page; int r; ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!ref) { ret = -ENOMEM; break; } refcount_set(&ref->refcount, 1); ref->buffer = iter->array_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (IS_ERR(ref->page)) { ret = PTR_ERR(ref->page); ref->page = NULL; kfree(ref); break; } ref->cpu = iter->cpu_file; r = ring_buffer_read_page(ref->buffer, ref->page, len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); break; } page = virt_to_page(ring_buffer_read_page_data(ref->page)); spd.pages[i] = page; spd.partial[i].len = page_size; spd.partial[i].offset = 0; spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; *ppos += page_size; entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); } trace_access_unlock(iter->cpu_file); spd.nr_pages = i; /* did we read anything? */ if (!spd.nr_pages) { if (ret) goto out; if (woken) goto out; ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent); if (ret) goto out; /* No need to wait after waking up when tracing is off */ if (!tracer_tracing_is_on(iter->tr)) goto out; /* Iterate one more time to collect any new data then exit */ woken = true; goto again; } ret = splice_to_pipe(pipe, &spd); out: splice_shrink_spd(&spd); return ret; } static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; int err; if (cmd == TRACE_MMAP_IOCTL_GET_READER) { if (!(file->f_flags & O_NONBLOCK)) { err = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, iter->tr->buffer_percent, NULL, NULL); if (err) return err; } return ring_buffer_map_get_reader(iter->array_buffer->buffer, iter->cpu_file); } else if (cmd) { return -ENOTTY; } /* * An ioctl call with cmd 0 to the ring buffer file will wake up all * waiters */ mutex_lock(&trace_types_lock); /* Make sure the waiters see the new wait_index */ (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); mutex_unlock(&trace_types_lock); return 0; } #ifdef CONFIG_TRACER_MAX_TRACE static int get_snapshot_map(struct trace_array *tr) { int err = 0; /* * Called with mmap_lock held. lockdep would be unhappy if we would now * take trace_types_lock. Instead use the specific * snapshot_trigger_lock. */ spin_lock(&tr->snapshot_trigger_lock); if (tr->snapshot || tr->mapped == UINT_MAX) err = -EBUSY; else tr->mapped++; spin_unlock(&tr->snapshot_trigger_lock); /* Wait for update_max_tr() to observe iter->tr->mapped */ if (tr->mapped == 1) synchronize_rcu(); return err; } static void put_snapshot_map(struct trace_array *tr) { spin_lock(&tr->snapshot_trigger_lock); if (!WARN_ON(!tr->mapped)) tr->mapped--; spin_unlock(&tr->snapshot_trigger_lock); } #else static inline int get_snapshot_map(struct trace_array *tr) { return 0; } static inline void put_snapshot_map(struct trace_array *tr) { } #endif static void tracing_buffers_mmap_close(struct vm_area_struct *vma) { struct ftrace_buffer_info *info = vma->vm_file->private_data; struct trace_iterator *iter = &info->iter; WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file)); put_snapshot_map(iter->tr); } static const struct vm_operations_struct tracing_buffers_vmops = { .close = tracing_buffers_mmap_close, }; static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; int ret = 0; ret = get_snapshot_map(iter->tr); if (ret) return ret; ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma); if (ret) put_snapshot_map(iter->tr); vma->vm_ops = &tracing_buffers_vmops; return ret; } static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, .poll = tracing_buffers_poll, .release = tracing_buffers_release, .flush = tracing_buffers_flush, .splice_read = tracing_buffers_splice_read, .unlocked_ioctl = tracing_buffers_ioctl, .mmap = tracing_buffers_mmap, }; static ssize_t tracing_stats_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(filp); struct trace_array *tr = inode->i_private; struct array_buffer *trace_buf = &tr->array_buffer; int cpu = tracing_get_cpu(inode); struct trace_seq *s; unsigned long cnt; unsigned long long t; unsigned long usec_rem; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "entries: %ld\n", cnt); cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "overrun: %ld\n", cnt); cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); if (trace_clocks[tr->clock_id].in_ns) { /* local or global for trace_clock */ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { /* counter or tsc mode for trace_clock */ trace_seq_printf(s, "oldest event ts: %llu\n", ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", ring_buffer_time_stamp(trace_buf->buffer)); } cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, trace_seq_used(s)); kfree(s); return count; } static const struct file_operations tracing_stats_fops = { .open = tracing_open_generic_tr, .read = tracing_stats_read, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; #ifdef CONFIG_DYNAMIC_FTRACE static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { ssize_t ret; char *buf; int r; /* 512 should be plenty to hold the amount needed */ #define DYN_INFO_BUF_SIZE 512 buf = kmalloc(DYN_INFO_BUF_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; r = scnprintf(buf, DYN_INFO_BUF_SIZE, "%ld pages:%ld groups: %ld\n" "ftrace boot update time = %llu (ns)\n" "ftrace module total update time = %llu (ns)\n", ftrace_update_tot_cnt, ftrace_number_of_pages, ftrace_number_of_groups, ftrace_update_time, ftrace_total_mod_time); ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); kfree(buf); return ret; } static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, .llseek = generic_file_llseek, }; #endif /* CONFIG_DYNAMIC_FTRACE */ #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) static void ftrace_snapshot(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { tracing_snapshot_instance(tr); } static void ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; long *count = NULL; if (mapper) count = (long *)ftrace_func_mapper_find_ip(mapper, ip); if (count) { if (*count <= 0) return; (*count)--; } tracing_snapshot_instance(tr); } static int ftrace_snapshot_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; long *count = NULL; seq_printf(m, "%ps:", (void *)ip); seq_puts(m, "snapshot"); if (mapper) count = (long *)ftrace_func_mapper_find_ip(mapper, ip); if (count) seq_printf(m, ":count=%ld\n", *count); else seq_puts(m, ":unlimited\n"); return 0; } static int ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *init_data, void **data) { struct ftrace_func_mapper *mapper = *data; if (!mapper) { mapper = allocate_ftrace_func_mapper(); if (!mapper) return -ENOMEM; *data = mapper; } return ftrace_func_mapper_add_ip(mapper, ip, init_data); } static void ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *data) { struct ftrace_func_mapper *mapper = data; if (!ip) { if (!mapper) return; free_ftrace_func_mapper(mapper, NULL); return; } ftrace_func_mapper_remove_ip(mapper, ip); } static struct ftrace_probe_ops snapshot_probe_ops = { .func = ftrace_snapshot, .print = ftrace_snapshot_print, }; static struct ftrace_probe_ops snapshot_count_probe_ops = { .func = ftrace_count_snapshot, .print = ftrace_snapshot_print, .init = ftrace_snapshot_init, .free = ftrace_snapshot_free, }; static int ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; if (!tr) return -ENODEV; /* hash funcs only work with set_ftrace_filter */ if (!enable) return -EINVAL; ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; if (glob[0] == '!') { ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!ret) tracing_disarm_snapshot(tr); return ret; } if (!param) goto out_reg; number = strsep(&param, ":"); if (!strlen(number)) goto out_reg; /* * We use the callback data field (which is a pointer) * as our counter. */ ret = kstrtoul(number, 0, (unsigned long *)&count); if (ret) return ret; out_reg: ret = tracing_arm_snapshot(tr); if (ret < 0) goto out; ret = register_ftrace_function_probe(glob, tr, ops, count); if (ret < 0) tracing_disarm_snapshot(tr); out: return ret < 0 ? ret : 0; } static struct ftrace_func_command ftrace_snapshot_cmd = { .name = "snapshot", .func = ftrace_trace_snapshot_callback, }; static __init int register_snapshot_cmd(void) { return register_ftrace_command(&ftrace_snapshot_cmd); } #else static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ static struct dentry *tracing_get_dentry(struct trace_array *tr) { if (WARN_ON(!tr->dir)) return ERR_PTR(-ENODEV); /* Top directory uses NULL as the parent */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return NULL; /* All sub buffers have a descriptor */ return tr->dir; } static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { struct dentry *d_tracer; if (tr->percpu_dir) return tr->percpu_dir; d_tracer = tracing_get_dentry(tr); if (IS_ERR(d_tracer)) return NULL; tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); MEM_FAIL(!tr->percpu_dir, "Could not create tracefs directory 'per_cpu/%d'\n", cpu); return tr->percpu_dir; } static struct dentry * trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, void *data, long cpu, const struct file_operations *fops) { struct dentry *ret = trace_create_file(name, mode, parent, data, fops); if (ret) /* See tracing_get_cpu() */ d_inode(ret)->i_cdev = (void *)(cpu + 1); return ret; } static void tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ if (!d_percpu) return; snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } /* per cpu trace_pipe */ trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_pipe_fops); /* per cpu trace */ trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu, tr, cpu, &tracing_fops); trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_buffers_fops); trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_stats_fops); trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_entries_fops); if (tr->range_addr_start) trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_buffer_meta_fops); #ifdef CONFIG_TRACER_SNAPSHOT if (!tr->range_addr_start) { trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, tr, cpu, &snapshot_fops); trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &snapshot_raw_fops); } #endif } #ifdef CONFIG_FTRACE_SELFTEST /* Let selftest have access to static functions in this file */ #include "trace_selftest.c" #endif static ssize_t trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_option_dentry *topt = filp->private_data; char *buf; if (topt->flags->val & topt->opt->bit) buf = "1\n"; else buf = "0\n"; return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); } static ssize_t trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_option_dentry *topt = filp->private_data; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (val != 0 && val != 1) return -EINVAL; if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) return ret; } *ppos += cnt; return cnt; } static int tracing_open_options(struct inode *inode, struct file *filp) { struct trace_option_dentry *topt = inode->i_private; int ret; ret = tracing_check_open_get_tr(topt->tr); if (ret) return ret; filp->private_data = inode->i_private; return 0; } static int tracing_release_options(struct inode *inode, struct file *file) { struct trace_option_dentry *topt = file->private_data; trace_array_put(topt->tr); return 0; } static const struct file_operations trace_options_fops = { .open = tracing_open_options, .read = trace_options_read, .write = trace_options_write, .llseek = generic_file_llseek, .release = tracing_release_options, }; /* * In order to pass in both the trace_array descriptor as well as the index * to the flag that the trace option file represents, the trace_array * has a character array of trace_flags_index[], which holds the index * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc. * The address of this character array is passed to the flag option file * read/write callbacks. * * In order to extract both the index and the trace_array descriptor, * get_tr_index() uses the following algorithm. * * idx = *ptr; * * As the pointer itself contains the address of the index (remember * index[1] == 1). * * Then to get the trace_array descriptor, by subtracting that index * from the ptr, we get to the start of the index itself. * * ptr - idx == &index[0] * * Then a simple container_of() from that pointer gets us to the * trace_array descriptor. */ static void get_tr_index(void *data, struct trace_array **ptr, unsigned int *pindex) { *pindex = *(unsigned char *)data; *ptr = container_of(data - *pindex, struct trace_array, trace_flags_index); } static ssize_t trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { void *tr_index = filp->private_data; struct trace_array *tr; unsigned int index; char *buf; get_tr_index(tr_index, &tr, &index); if (tr->trace_flags & (1 << index)) buf = "1\n"; else buf = "0\n"; return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); } static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { void *tr_index = filp->private_data; struct trace_array *tr; unsigned int index; unsigned long val; int ret; get_tr_index(tr_index, &tr, &index); ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (val != 0 && val != 1) return -EINVAL; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = set_tracer_flag(tr, 1 << index, val); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); if (ret < 0) return ret; *ppos += cnt; return cnt; } static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, .write = trace_options_core_write, .llseek = generic_file_llseek, }; struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { struct dentry *ret; ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } static struct dentry *trace_options_init_dentry(struct trace_array *tr) { struct dentry *d_tracer; if (tr->options) return tr->options; d_tracer = tracing_get_dentry(tr); if (IS_ERR(d_tracer)) return NULL; tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { pr_warn("Could not create tracefs directory 'options'\n"); return NULL; } return tr->options; } static void create_trace_option_file(struct trace_array *tr, struct trace_option_dentry *topt, struct tracer_flags *flags, struct tracer_opt *opt) { struct dentry *t_options; t_options = trace_options_init_dentry(tr); if (!t_options) return; topt->flags = flags; topt->opt = opt; topt->tr = tr; topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, t_options, topt, &trace_options_fops); } static void create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; struct trace_options *tr_topts; struct tracer_flags *flags; struct tracer_opt *opts; int cnt; int i; if (!tracer) return; flags = tracer->flags; if (!flags || !flags->opts) return; /* * If this is an instance, only create flags for tracers * the instance may have. */ if (!trace_ok_for_array(tracer, tr)) return; for (i = 0; i < tr->nr_topts; i++) { /* Make sure there's no duplicate flags. */ if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) return; } opts = flags->opts; for (cnt = 0; opts[cnt].name; cnt++) ; topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) return; tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), GFP_KERNEL); if (!tr_topts) { kfree(topts); return; } tr->topts = tr_topts; tr->topts[tr->nr_topts].tracer = tracer; tr->topts[tr->nr_topts].topts = topts; tr->nr_topts++; for (cnt = 0; opts[cnt].name; cnt++) { create_trace_option_file(tr, &topts[cnt], flags, &opts[cnt]); MEM_FAIL(topts[cnt].entry == NULL, "Failed to create trace option: %s", opts[cnt].name); } } static struct dentry * create_trace_option_core_file(struct trace_array *tr, const char *option, long index) { struct dentry *t_options; t_options = trace_options_init_dentry(tr); if (!t_options) return NULL; return trace_create_file(option, TRACE_MODE_WRITE, t_options, (void *)&tr->trace_flags_index[index], &trace_options_core_fops); } static void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; bool top_level = tr == &global_trace; int i; t_options = trace_options_init_dentry(tr); if (!t_options) return; for (i = 0; trace_options[i]; i++) { if (top_level || !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) create_trace_option_core_file(tr, trace_options[i], i); } } static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[64]; int r; r = tracer_tracing_is_on(tr); r = sprintf(buf, "%d\n", r); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; struct trace_buffer *buffer = tr->array_buffer.buffer; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (buffer) { mutex_lock(&trace_types_lock); if (!!val == tracer_tracing_is_on(tr)) { val = 0; /* do nothing */ } else if (val) { tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); } else { tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); /* Wake up any waiters */ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } mutex_unlock(&trace_types_lock); } (*ppos)++; return cnt; } static const struct file_operations rb_simple_fops = { .open = tracing_open_generic_tr, .read = rb_simple_read, .write = rb_simple_write, .release = tracing_release_generic_tr, .llseek = default_llseek, }; static ssize_t buffer_percent_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[64]; int r; r = tr->buffer_percent; r = sprintf(buf, "%d\n", r); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t buffer_percent_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (val > 100) return -EINVAL; tr->buffer_percent = val; (*ppos)++; return cnt; } static const struct file_operations buffer_percent_fops = { .open = tracing_open_generic_tr, .read = buffer_percent_read, .write = buffer_percent_write, .release = tracing_release_generic_tr, .llseek = default_llseek, }; static ssize_t buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; size_t size; char buf[64]; int order; int r; order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); size = (PAGE_SIZE << order) / 1024; r = sprintf(buf, "%zd\n", size); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t buffer_subbuf_size_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; unsigned long val; int old_order; int order; int pages; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; val *= 1024; /* value passed in is in KB */ pages = DIV_ROUND_UP(val, PAGE_SIZE); order = fls(pages - 1); /* limit between 1 and 128 system pages */ if (order < 0 || order > 7) return -EINVAL; /* Do not allow tracing while changing the order of the ring buffer */ tracing_stop_tr(tr); old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); if (old_order == order) goto out; ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order); if (ret) goto out; #ifdef CONFIG_TRACER_MAX_TRACE if (!tr->allocated_snapshot) goto out_max; ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); if (ret) { /* Put back the old order */ cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); if (WARN_ON_ONCE(cnt)) { /* * AARGH! We are left with different orders! * The max buffer is our "snapshot" buffer. * When a tracer needs a snapshot (one of the * latency tracers), it swaps the max buffer * with the saved snap shot. We succeeded to * update the order of the main buffer, but failed to * update the order of the max buffer. But when we tried * to reset the main buffer to the original size, we * failed there too. This is very unlikely to * happen, but if it does, warn and kill all * tracing. */ tracing_disabled = 1; } goto out; } out_max: #endif (*ppos)++; out: if (ret) cnt = ret; tracing_start_tr(tr); return cnt; } static const struct file_operations buffer_subbuf_size_fops = { .open = tracing_open_generic_tr, .read = buffer_subbuf_size_read, .write = buffer_subbuf_size_write, .release = tracing_release_generic_tr, .llseek = default_llseek, }; static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); static int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; buf->tr = tr; if (tr->range_addr_start && tr->range_addr_size) { buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, tr->range_addr_size); ring_buffer_last_boot_delta(buf->buffer, &tr->text_delta, &tr->data_delta); /* * This is basically the same as a mapped buffer, * with the same restrictions. */ tr->mapped++; } else { buf->buffer = ring_buffer_alloc(size, rb_flags); } if (!buf->buffer) return -ENOMEM; buf->data = alloc_percpu(struct trace_array_cpu); if (!buf->data) { ring_buffer_free(buf->buffer); buf->buffer = NULL; return -ENOMEM; } /* Allocate the first page for all buffers */ set_buffer_entries(&tr->array_buffer, ring_buffer_size(tr->array_buffer.buffer, 0)); return 0; } static void free_trace_buffer(struct array_buffer *buf) { if (buf->buffer) { ring_buffer_free(buf->buffer); buf->buffer = NULL; free_percpu(buf->data); buf->data = NULL; } } static int allocate_trace_buffers(struct trace_array *tr, int size) { int ret; ret = allocate_trace_buffer(tr, &tr->array_buffer, size); if (ret) return ret; #ifdef CONFIG_TRACER_MAX_TRACE /* Fix mapped buffer trace arrays do not have snapshot buffers */ if (tr->range_addr_start) return 0; ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { free_trace_buffer(&tr->array_buffer); return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; allocate_snapshot = false; #endif return 0; } static void free_trace_buffers(struct trace_array *tr) { if (!tr) return; free_trace_buffer(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); #endif } static void init_trace_flags_index(struct trace_array *tr) { int i; /* Used by the trace options files */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) tr->trace_flags_index[i] = i; } static void __update_tracer_options(struct trace_array *tr) { struct tracer *t; for (t = trace_types; t; t = t->next) add_tracer_options(tr, t); } static void update_tracer_options(struct trace_array *tr) { mutex_lock(&trace_types_lock); tracer_options_updated = true; __update_tracer_options(tr); mutex_unlock(&trace_types_lock); } /* Must have trace_types_lock held */ struct trace_array *trace_array_find(const char *instance) { struct trace_array *tr, *found = NULL; list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr->name && strcmp(tr->name, instance) == 0) { found = tr; break; } } return found; } struct trace_array *trace_array_find_get(const char *instance) { struct trace_array *tr; mutex_lock(&trace_types_lock); tr = trace_array_find(instance); if (tr) tr->ref++; mutex_unlock(&trace_types_lock); return tr; } static int trace_array_create_dir(struct trace_array *tr) { int ret; tr->dir = tracefs_create_dir(tr->name, trace_instance_dir); if (!tr->dir) return -EINVAL; ret = event_trace_add_tracer(tr->dir, tr); if (ret) { tracefs_remove(tr->dir); return ret; } init_tracer_tracefs(tr, tr->dir); __update_tracer_options(tr); return ret; } static struct trace_array * trace_array_create_systems(const char *name, const char *systems, unsigned long range_addr_start, unsigned long range_addr_size) { struct trace_array *tr; int ret; ret = -ENOMEM; tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) return ERR_PTR(ret); tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) goto out_free_tr; if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) goto out_free_tr; if (systems) { tr->system_names = kstrdup_const(systems, GFP_KERNEL); if (!tr->system_names) goto out_free_tr; } /* Only for boot up memory mapped ring buffers */ tr->range_addr_start = range_addr_start; tr->range_addr_size = range_addr_size; tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&tr->start_lock); tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE spin_lock_init(&tr->snapshot_trigger_lock); #endif tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); #ifdef CONFIG_MODULES INIT_LIST_HEAD(&tr->mod_events); #endif if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; /* The ring buffer is defaultly expanded */ trace_set_ring_buffer_expanded(tr); if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; ftrace_init_trace_array(tr); init_trace_flags_index(tr); if (trace_instance_dir) { ret = trace_array_create_dir(tr); if (ret) goto out_free_tr; } else __trace_early_add_events(tr); list_add(&tr->list, &ftrace_trace_arrays); tr->ref++; return tr; out_free_tr: ftrace_free_ftrace_ops(tr); free_trace_buffers(tr); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree_const(tr->system_names); kfree(tr->name); kfree(tr); return ERR_PTR(ret); } static struct trace_array *trace_array_create(const char *name) { return trace_array_create_systems(name, NULL, 0, 0); } static int instance_mkdir(const char *name) { struct trace_array *tr; int ret; guard(mutex)(&event_mutex); guard(mutex)(&trace_types_lock); ret = -EEXIST; if (trace_array_find(name)) return -EEXIST; tr = trace_array_create(name); ret = PTR_ERR_OR_ZERO(tr); return ret; } static u64 map_pages(u64 start, u64 size) { struct page **pages; phys_addr_t page_start; unsigned int page_count; unsigned int i; void *vaddr; page_count = DIV_ROUND_UP(size, PAGE_SIZE); page_start = start; pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); if (!pages) return 0; for (i = 0; i < page_count; i++) { phys_addr_t addr = page_start + i * PAGE_SIZE; pages[i] = pfn_to_page(addr >> PAGE_SHIFT); } vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL); kfree(pages); return (u64)(unsigned long)vaddr; } /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. * @name: The name of the trace array to be looked up/created. * @systems: A list of systems to create event directories for (NULL for all) * * Returns pointer to trace array with given name. * NULL, if it cannot be created. * * NOTE: This function increments the reference counter associated with the * trace array returned. This makes sure it cannot be freed while in use. * Use trace_array_put() once the trace array is no longer needed. * If the trace_array is to be freed, trace_array_destroy() needs to * be called after the trace_array_put(), or simply let user space delete * it from the tracefs instances directory. But until the * trace_array_put() is called, user space can not delete it. * */ struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { struct trace_array *tr; guard(mutex)(&event_mutex); guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr->name && strcmp(tr->name, name) == 0) { tr->ref++; return tr; } } tr = trace_array_create_systems(name, systems, 0, 0); if (IS_ERR(tr)) tr = NULL; else tr->ref++; return tr; } EXPORT_SYMBOL_GPL(trace_array_get_by_name); static int __remove_instance(struct trace_array *tr) { int i; /* Reference counter for a newly created trace array = 1. */ if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) return -EBUSY; list_del(&tr->list); /* Disable all the flags that were enabled coming in */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { if ((1 << i) & ZEROED_TRACE_FLAGS) set_tracer_flag(tr, 1 << i, 0); } if (printk_trace == tr) update_printk_trace(&global_trace); tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove(tr->dir); free_percpu(tr->last_func_repeats); free_trace_buffers(tr); clear_tracing_err_log(tr); for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); } kfree(tr->topts); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree_const(tr->system_names); kfree(tr->name); kfree(tr); return 0; } int trace_array_destroy(struct trace_array *this_tr) { struct trace_array *tr; if (!this_tr) return -EINVAL; guard(mutex)(&event_mutex); guard(mutex)(&trace_types_lock); /* Making sure trace array exists before destroying it. */ list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) return __remove_instance(tr); } return -ENODEV; } EXPORT_SYMBOL_GPL(trace_array_destroy); static int instance_rmdir(const char *name) { struct trace_array *tr; guard(mutex)(&event_mutex); guard(mutex)(&trace_types_lock); tr = trace_array_find(name); if (!tr) return -ENODEV; return __remove_instance(tr); } static __init void create_trace_instances(struct dentry *d_tracer) { struct trace_array *tr; trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, instance_mkdir, instance_rmdir); if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; guard(mutex)(&event_mutex); guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->name) continue; if (MEM_FAIL(trace_array_create_dir(tr) < 0, "Failed to create instance directory\n")) return; } } static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, tr, &show_traces_fops); trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, tr, &set_tracer_fops); trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, tr, &tracing_cpumask_fops); trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, tr, &tracing_iter_fops); trace_create_file("trace", TRACE_MODE_WRITE, d_tracer, tr, &tracing_fops); trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, tr, &tracing_pipe_fops); trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, tr, &tracing_total_entries_fops); trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); tr->trace_marker_file = __find_event_file(tr, "ftrace", "print"); trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, &trace_clock_fops); trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, tr, &rb_simple_fops); trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, &trace_time_stamp_mode_fops); tr->buffer_percent = 50; trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, tr, &buffer_percent_fops); trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &buffer_subbuf_size_fops); create_trace_options_dir(tr); #ifdef CONFIG_TRACER_MAX_TRACE trace_create_maxlat_file(tr, d_tracer); #endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); if (tr->range_addr_start) { trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, tr, &last_boot_fops); #ifdef CONFIG_TRACER_SNAPSHOT } else { trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); #endif } trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops); for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); ftrace_init_tracefs(tr, d_tracer); } static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; /* * To maintain backward compatibility for tools that mount * debugfs to get to the tracing facility, tracefs is automatically * mounted to the debugfs/tracing directory. */ type = get_fs_type("tracefs"); if (!type) return NULL; mnt = vfs_submount(mntpt, type, "tracefs", NULL); put_filesystem(type); if (IS_ERR(mnt)) return NULL; mntget(mnt); return mnt; } /** * tracing_init_dentry - initialize top level trace array * * This is called when creating files or directories in the tracing * directory. It is called via fs_initcall() by any of the boot up code * and expects to return the dentry of the top level tracing directory. */ int tracing_init_dentry(void) { struct trace_array *tr = &global_trace; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Tracing disabled due to lockdown\n"); return -EPERM; } /* The top level trace array uses NULL as parent */ if (tr->dir) return 0; if (WARN_ON(!tracefs_initialized())) return -ENODEV; /* * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount * the tracefs file system there, so older tools still * work with the newer kernel. */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); return 0; } extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; static struct workqueue_struct *eval_map_wq __initdata; static struct work_struct eval_map_work __initdata; static struct work_struct tracerfs_init_work __initdata; static void __init eval_map_work_func(struct work_struct *work) { int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); } static int __init trace_eval_init(void) { INIT_WORK(&eval_map_work, eval_map_work_func); eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0); if (!eval_map_wq) { pr_err("Unable to allocate eval_map_wq\n"); /* Do work here */ eval_map_work_func(&eval_map_work); return -ENOMEM; } queue_work(eval_map_wq, &eval_map_work); return 0; } subsys_initcall(trace_eval_init); static int __init trace_eval_sync(void) { /* Make sure the eval map updates are finished */ if (eval_map_wq) destroy_workqueue(eval_map_wq); return 0; } late_initcall_sync(trace_eval_sync); #ifdef CONFIG_MODULES bool module_exists(const char *module) { /* All modules have the symbol __this_module */ static const char this_mod[] = "__this_module"; char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); if (n > sizeof(modname) - 1) return false; val = module_kallsyms_lookup_name(modname); return val != 0; } static void trace_module_add_evals(struct module *mod) { if (!mod->num_trace_evals) return; /* * Modules with bad taint do not have events created, do * not bother with enums either. */ if (trace_module_has_bad_taint(mod)) return; trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_EVAL_MAP_FILE static void trace_module_remove_evals(struct module *mod) { union trace_eval_map_item *map; union trace_eval_map_item **last = &trace_eval_maps; if (!mod->num_trace_evals) return; guard(mutex)(&trace_eval_mutex); map = trace_eval_maps; while (map) { if (map->head.mod == mod) break; map = trace_eval_jmp_to_tail(map); last = &map->tail.next; map = map->tail.next; } if (!map) return; *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); } #else static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); break; } return NOTIFY_OK; } static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; #endif /* CONFIG_MODULES */ static __init void tracer_init_tracefs_work_func(struct work_struct *work) { event_trace_init(); init_tracer_tracefs(&global_trace, NULL); ftrace_init_tracefs_toplevel(&global_trace, NULL); trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL, &global_trace, &tracing_thresh_fops); trace_create_file("README", TRACE_MODE_READ, NULL, NULL, &tracing_readme_fops); trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL, NULL, &tracing_saved_cmdlines_fops); trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL, NULL, &tracing_saved_cmdlines_size_fops); trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, NULL, &tracing_saved_tgids_fops); trace_create_eval_file(NULL); #ifdef CONFIG_MODULES register_module_notifier(&trace_module_nb); #endif #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL, NULL, &tracing_dyn_info_fops); #endif create_trace_instances(NULL); update_tracer_options(&global_trace); } static __init int tracer_init_tracefs(void) { int ret; trace_access_lock_init(); ret = tracing_init_dentry(); if (ret) return 0; if (eval_map_wq) { INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func); queue_work(eval_map_wq, &tracerfs_init_work); } else { tracer_init_tracefs_work_func(NULL); } rv_init_interface(); return 0; } fs_initcall(tracer_init_tracefs); static int trace_die_panic_handler(struct notifier_block *self, unsigned long ev, void *unused); static struct notifier_block trace_panic_notifier = { .notifier_call = trace_die_panic_handler, .priority = INT_MAX - 1, }; static struct notifier_block trace_die_notifier = { .notifier_call = trace_die_panic_handler, .priority = INT_MAX - 1, }; /* * The idea is to execute the following die/panic callback early, in order * to avoid showing irrelevant information in the trace (like other panic * notifier functions); we are the 2nd to run, after hung_task/rcu_stall * warnings get disabled (to prevent potential log flooding). */ static int trace_die_panic_handler(struct notifier_block *self, unsigned long ev, void *unused) { if (!ftrace_dump_on_oops_enabled()) return NOTIFY_DONE; /* The die notifier requires DIE_OOPS to trigger */ if (self == &trace_die_notifier && ev != DIE_OOPS) return NOTIFY_DONE; ftrace_dump(DUMP_PARAM); return NOTIFY_DONE; } /* * printk is set to max of 1024, we really don't need it that big. * Nothing should be printing 1000 characters anyway. */ #define TRACE_MAX_PRINT 1000 /* * Define here KERN_TRACE so that we have one place to modify * it if we decide to change what log level the ftrace dump * should be at. */ #define KERN_TRACE KERN_EMERG void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ if (s->seq.len >= TRACE_MAX_PRINT) s->seq.len = TRACE_MAX_PRINT; /* * More paranoid code. Although the buffer size is set to * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just * an extra layer of protection. */ if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) s->seq.len = s->seq.size - 1; /* should be zero ended, but we are paranoid. */ s->buffer[s->seq.len] = 0; printk(KERN_TRACE "%s", s->buffer); trace_seq_init(s); } static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr) { iter->tr = tr; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; iter->array_buffer = &tr->array_buffer; if (iter->trace && iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->array_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[iter->tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* Can not use kmalloc for iter.temp and iter.fmt */ iter->temp = static_temp_buf; iter->temp_size = STATIC_TEMP_BUF_SIZE; iter->fmt = static_fmt_buf; iter->fmt_size = STATIC_FMT_BUF_SIZE; } void trace_init_global_iter(struct trace_iterator *iter) { trace_init_iter(iter, &global_trace); } static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; unsigned long flags; int cnt = 0, cpu; /* * Always turn off tracing when we dump. * We don't need to show trace output of what happens * between multiple crashes. * * If the user does a sysrq-z, then they can re-enable * tracing with echo 1 > tracing_on. */ tracer_tracing_off(tr); local_irq_save(flags); /* Simulate the iterator */ trace_init_iter(&iter, tr); for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); else iter.cpu_file = RING_BUFFER_ALL_CPUS; if (tr == &global_trace) printk(KERN_TRACE "Dumping ftrace buffer:\n"); else printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name); /* Did function tracer already get disabled? */ if (ftrace_is_dead()) { printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); printk("# MAY BE MISSING FUNCTION EVENTS\n"); } /* * We need to stop all tracing on all CPUS to read * the next buffer. This is a bit expensive, but is * not done often. We fill all what we can read, * and then release the locks again. */ while (!trace_empty(&iter)) { if (!cnt) printk(KERN_TRACE "---------------------------------\n"); cnt++; trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; if (trace_find_next_entry_inc(&iter) != NULL) { int ret; ret = print_trace_line(&iter); if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); } touch_nmi_watchdog(); trace_printk_seq(&iter.seq); } if (!cnt) printk(KERN_TRACE " (ftrace buffer empty)\n"); else printk(KERN_TRACE "---------------------------------\n"); tr->trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } local_irq_restore(flags); } static void ftrace_dump_by_param(void) { bool first_param = true; char dump_param[MAX_TRACER_SIZE]; char *buf, *token, *inst_name; struct trace_array *tr; strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE); buf = dump_param; while ((token = strsep(&buf, ",")) != NULL) { if (first_param) { first_param = false; if (!strcmp("0", token)) continue; else if (!strcmp("1", token)) { ftrace_dump_one(&global_trace, DUMP_ALL); continue; } else if (!strcmp("2", token) || !strcmp("orig_cpu", token)) { ftrace_dump_one(&global_trace, DUMP_ORIG); continue; } } inst_name = strsep(&token, "="); tr = trace_array_find(inst_name); if (!tr) { printk(KERN_TRACE "Instance %s not found\n", inst_name); continue; } if (token && (!strcmp("2", token) || !strcmp("orig_cpu", token))) ftrace_dump_one(tr, DUMP_ORIG); else ftrace_dump_one(tr, DUMP_ALL); } } void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { static atomic_t dump_running; /* Only allow one dump user at a time. */ if (atomic_inc_return(&dump_running) != 1) { atomic_dec(&dump_running); return; } switch (oops_dump_mode) { case DUMP_ALL: ftrace_dump_one(&global_trace, DUMP_ALL); break; case DUMP_ORIG: ftrace_dump_one(&global_trace, DUMP_ORIG); break; case DUMP_PARAM: ftrace_dump_by_param(); break; case DUMP_NONE: break; default: printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); ftrace_dump_one(&global_trace, DUMP_ALL); } atomic_dec(&dump_running); } EXPORT_SYMBOL_GPL(ftrace_dump); #define WRITE_BUFSIZE 4096 ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(const char *)) { char *kbuf, *buf, *tmp; int ret = 0; size_t done = 0; size_t size; kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); if (!kbuf) return -ENOMEM; while (done < count) { size = count - done; if (size >= WRITE_BUFSIZE) size = WRITE_BUFSIZE - 1; if (copy_from_user(kbuf, buffer + done, size)) { ret = -EFAULT; goto out; } kbuf[size] = '\0'; buf = kbuf; do { tmp = strchr(buf, '\n'); if (tmp) { *tmp = '\0'; size = tmp - buf + 1; } else { size = strlen(buf); if (done + size < count) { if (buf != kbuf) break; /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ pr_warn("Line length is too long: Should be less than %d\n", WRITE_BUFSIZE - 2); ret = -EINVAL; goto out; } } done += size; /* Remove comments */ tmp = strchr(buf, '#'); if (tmp) *tmp = '\0'; ret = createfn(buf); if (ret) goto out; buf += size; } while (done < count); } ret = done; out: kfree(kbuf); return ret; } #ifdef CONFIG_TRACER_MAX_TRACE __init static bool tr_needs_alloc_snapshot(const char *name) { char *test; int len = strlen(name); bool ret; if (!boot_snapshot_index) return false; if (strncmp(name, boot_snapshot_info, len) == 0 && boot_snapshot_info[len] == '\t') return true; test = kmalloc(strlen(name) + 3, GFP_KERNEL); if (!test) return false; sprintf(test, "\t%s\t", name); ret = strstr(boot_snapshot_info, test) == NULL; kfree(test); return ret; } __init static void do_allocate_snapshot(const char *name) { if (!tr_needs_alloc_snapshot(name)) return; /* * When allocate_snapshot is set, the next call to * allocate_trace_buffers() (called by trace_array_get_by_name()) * will allocate the snapshot buffer. That will alse clear * this flag. */ allocate_snapshot = true; } #else static inline void do_allocate_snapshot(const char *name) { } #endif __init static void enable_instances(void) { struct trace_array *tr; char *curr_str; char *name; char *str; char *tok; /* A tab is always appended */ boot_instance_info[boot_instance_index - 1] = '\0'; str = boot_instance_info; while ((curr_str = strsep(&str, "\t"))) { phys_addr_t start = 0; phys_addr_t size = 0; unsigned long addr = 0; bool traceprintk = false; bool traceoff = false; char *flag_delim; char *addr_delim; tok = strsep(&curr_str, ","); flag_delim = strchr(tok, '^'); addr_delim = strchr(tok, '@'); if (addr_delim) *addr_delim++ = '\0'; if (flag_delim) *flag_delim++ = '\0'; name = tok; if (flag_delim) { char *flag; while ((flag = strsep(&flag_delim, "^"))) { if (strcmp(flag, "traceoff") == 0) { traceoff = true; } else if ((strcmp(flag, "printk") == 0) || (strcmp(flag, "traceprintk") == 0) || (strcmp(flag, "trace_printk") == 0)) { traceprintk = true; } else { pr_info("Tracing: Invalid instance flag '%s' for %s\n", flag, name); } } } tok = addr_delim; if (tok && isdigit(*tok)) { start = memparse(tok, &tok); if (!start) { pr_warn("Tracing: Invalid boot instance address for %s\n", name); continue; } if (*tok != ':') { pr_warn("Tracing: No size specified for instance %s\n", name); continue; } tok++; size = memparse(tok, &tok); if (!size) { pr_warn("Tracing: Invalid boot instance size for %s\n", name); continue; } } else if (tok) { if (!reserve_mem_find_by_name(tok, &start, &size)) { start = 0; pr_warn("Failed to map boot instance %s to %s\n", name, tok); continue; } } if (start) { addr = map_pages(start, size); if (addr) { pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n", name, &start, (unsigned long)size); } else { pr_warn("Tracing: Failed to map boot instance %s\n", name); continue; } } else { /* Only non mapped buffers have snapshot buffers */ if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) do_allocate_snapshot(name); } tr = trace_array_create_systems(name, NULL, addr, size); if (IS_ERR(tr)) { pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str); continue; } if (traceoff) tracer_tracing_off(tr); if (traceprintk) update_printk_trace(tr); /* * If start is set, then this is a mapped buffer, and * cannot be deleted by user space, so keep the reference * to it. */ if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT; tr->ref++; } while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); } } } __init static int tracer_alloc_buffers(void) { int ring_buf_size; int ret = -ENOMEM; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Tracing disabled due to lockdown\n"); return -EPERM; } /* * Make sure we don't accidentally add more trace options * than we have bits for. */ BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; /* Only allocate trace_printk buffers if a trace_printk exists */ if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt) /* Must be called before global_trace.buffer is allocated */ trace_printk_init_buffers(); /* To save memory, keep the ring buffer size to its minimum */ if (global_trace.ring_buffer_expanded) ring_buf_size = trace_buf_size; else ring_buf_size = 1; cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&global_trace.start_lock); /* * The prepare callbacks allocates some memory for the ring buffer. We * don't free the buffer if the CPU goes down. If we were to free * the buffer, then the user would lose any trace that was in the * buffer. The memory will be removed once the "instance" is removed. */ ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, "trace/RB:prepare", trace_rb_cpu_prepare, NULL); if (ret < 0) goto out_free_cpumask; /* Used for event triggers */ ret = -ENOMEM; temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) goto out_rm_hp_state; if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) goto out_free_savedcmd; /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); goto out_free_pipe_cpumask; } if (global_trace.buffer_disabled) tracing_off(); if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) pr_warn("Trace clock %s not defined, going back to default\n", trace_boot_clock); } /* * register_tracer() might reference current_trace, so it * needs to be set before we register anything. This is * just a bootstrap of current_trace anyway. */ global_trace.current_trace = &nop_trace; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE spin_lock_init(&global_trace.snapshot_trigger_lock); #endif ftrace_init_global_array_ops(&global_trace); #ifdef CONFIG_MODULES INIT_LIST_HEAD(&global_trace.mod_events); #endif init_trace_flags_index(&global_trace); register_tracer(&nop_trace); /* Function tracing may start here (via kernel command line) */ init_function_trace(); /* All seems OK, enable tracing */ tracing_disabled = 0; atomic_notifier_chain_register(&panic_notifier_list, &trace_panic_notifier); register_die_notifier(&trace_die_notifier); global_trace.flags = TRACE_ARRAY_FL_GLOBAL; INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); INIT_LIST_HEAD(&global_trace.err_log); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); register_snapshot_cmd(); return 0; out_free_pipe_cpumask: free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: trace_free_saved_cmdlines_buffer(); out_free_temp_buffer: ring_buffer_free(temp_buffer); out_rm_hp_state: cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE); out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); out: return ret; } #ifdef CONFIG_FUNCTION_TRACER /* Used to set module cached ftrace filtering at boot up */ __init struct trace_array *trace_get_global_array(void) { return &global_trace; } #endif void __init ftrace_boot_snapshot(void) { #ifdef CONFIG_TRACER_MAX_TRACE struct trace_array *tr; if (!snapshot_at_boot) return; list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->allocated_snapshot) continue; tracing_snapshot_instance(tr); trace_array_puts(tr, "** Boot snapshot taken **\n"); } #endif } void __init early_trace_init(void) { if (tracepoint_printk) { tracepoint_print_iter = kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); if (MEM_FAIL(!tracepoint_print_iter, "Failed to allocate trace iterator\n")) tracepoint_printk = 0; else static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); init_events(); } void __init trace_init(void) { trace_event_init(); if (boot_instance_index) enable_instances(); } __init static void clear_boot_tracer(void) { /* * The default tracer at boot buffer is an init section. * This function is called in lateinit. If we did not * find the boot tracer, then clear it out, to prevent * later registration from accessing the buffer that is * about to be freed. */ if (!default_bootup_tracer) return; printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", default_bootup_tracer); default_bootup_tracer = NULL; } #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK __init static void tracing_set_default_clock(void) { /* sched_clock_stable() is determined in late_initcall */ if (!trace_boot_clock && !sched_clock_stable()) { if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Can not set tracing clock due to lockdown\n"); return; } printk(KERN_WARNING "Unstable clock detected, switching default tracing clock to \"global\"\n" "If you want to keep using the local clock, then add:\n" " \"trace_clock=local\"\n" "on the kernel command line\n"); tracing_set_clock(&global_trace, "global"); } } #else static inline void tracing_set_default_clock(void) { } #endif __init static int late_trace_init(void) { if (tracepoint_printk && tracepoint_printk_stop_on_boot) { static_key_disable(&tracepoint_printk_key.key); tracepoint_printk = 0; } tracing_set_default_clock(); clear_boot_tracer(); return 0; } late_initcall_sync(late_trace_init);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ #include <linux/types.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> DEFINE_BPF_STORAGE_CACHE(cgroup_cache); static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); static void bpf_cgrp_storage_lock(void) { cant_migrate(); this_cpu_inc(bpf_cgrp_storage_busy); } static void bpf_cgrp_storage_unlock(void) { this_cpu_dec(bpf_cgrp_storage_busy); } static bool bpf_cgrp_storage_trylock(void) { cant_migrate(); if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { this_cpu_dec(bpf_cgrp_storage_busy); return false; } return true; } static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) { struct cgroup *cg = owner; return &cg->bpf_cgrp_storage; } void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; migrate_disable(); rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) goto out; bpf_cgrp_storage_lock(); bpf_local_storage_destroy(local_storage); bpf_cgrp_storage_unlock(); out: rcu_read_unlock(); migrate_enable(); } static struct bpf_local_storage_data * cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *cgroup_storage; struct bpf_local_storage_map *smap; cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage, bpf_rcu_lock_held()); if (!cgroup_storage) return NULL; smap = (struct bpf_local_storage_map *)map; return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit); } static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return ERR_CAST(cgroup); bpf_cgrp_storage_lock(); sdata = cgroup_storage_lookup(cgroup, map, true); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return sdata ? sdata->data : NULL; } static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, map_flags, false, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) { struct bpf_local_storage_data *sdata; sdata = cgroup_storage_lookup(cgroup, map, false); if (!sdata) return -ENOENT; bpf_selem_unlink(SELEM(sdata), false); return 0; } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) { struct cgroup *cgroup; int err, fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); err = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return err; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { return bpf_local_storage_map_alloc(attr, &cgroup_cache, true); } static void cgroup_storage_map_free(struct bpf_map *map) { bpf_local_storage_map_free(map, &cgroup_cache, NULL); } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; if (!cgroup) return (unsigned long)NULL; if (!bpf_cgrp_storage_trylock()) return (unsigned long)NULL; sdata = cgroup_storage_lookup(cgroup, map, true); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); unlock: bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) { int ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!cgroup) return -EINVAL; if (!bpf_cgrp_storage_trylock()) return -EBUSY; ret = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); return ret; } const struct bpf_map_ops cgrp_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_cgrp_storage_lookup_elem, .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; const struct bpf_func_proto bpf_cgrp_storage_get_proto = { .func = bpf_cgrp_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { .func = bpf_cgrp_storage_delete, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], };
5 4 4 2 3 3 5 5 2 3 4 3 3 2 8 8 3 3 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 // SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for x86_64/AVX2 assembler optimized version of Serpent * * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> */ #include <linux/module.h> #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-avx.h" #include "ecb_cbc_helpers.h" #define SERPENT_AVX2_PARALLEL_BLOCKS 16 /* 16-way AVX2 parallel cipher functions */ asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } static int ecb_encrypt(struct skcipher_request *req) { ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_enc_16way); ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx); ECB_BLOCK(1, __serpent_encrypt); ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_dec_16way); ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx); ECB_BLOCK(1, __serpent_decrypt); ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); CBC_ENC_BLOCK(__serpent_encrypt); CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); CBC_DEC_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_cbc_dec_16way); CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx); CBC_DEC_BLOCK(1, __serpent_decrypt); CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { { .base.cra_name = "__ecb(serpent)", .base.cra_driver_name = "__ecb-serpent-avx2", .base.cra_priority = 600, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "__cbc(serpent)", .base.cra_driver_name = "__cbc-serpent-avx2", .base.cra_priority = 600, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .ivsize = SERPENT_BLOCK_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, }; static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; static int __init serpent_avx2_init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return simd_register_skciphers_compat(serpent_algs, ARRAY_SIZE(serpent_algs), serpent_simd_algs); } static void __exit serpent_avx2_fini(void) { simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), serpent_simd_algs); } module_init(serpent_avx2_init); module_exit(serpent_avx2_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); MODULE_ALIAS_CRYPTO("serpent"); MODULE_ALIAS_CRYPTO("serpent-asm");
1 10 10 10 1 10 5 4 4 4 8 8 8 8 7 8 1 1 1 1 1 1 1 2 2 10 10 3 2 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 // SPDX-License-Identifier: GPL-2.0-or-later /* * HID support for Linux * * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2006-2012 Jiri Kosina */ /* */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/unaligned.h> #include <asm/byteorder.h> #include <linux/input.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/sched.h> #include <linux/semaphore.h> #include <linux/hid.h> #include <linux/hiddev.h> #include <linux/hid-debug.h> #include <linux/hidraw.h> #include "hid-ids.h" /* * Version Information */ #define DRIVER_DESC "HID core driver" static int hid_ignore_special_drivers = 0; module_param_named(ignore_special_drivers, hid_ignore_special_drivers, int, 0600); MODULE_PARM_DESC(ignore_special_drivers, "Ignore any special drivers and handle all devices by generic driver"); /* * Convert a signed n-bit integer to signed 32-bit integer. */ static s32 snto32(__u32 value, unsigned int n) { if (!value || !n) return 0; if (n > 32) n = 32; return sign_extend32(value, n - 1); } /* * Convert a signed 32-bit integer to a signed n-bit integer. */ static u32 s32ton(__s32 value, unsigned int n) { s32 a = value >> (n - 1); if (a && a != -1) return value < 0 ? 1 << (n - 1) : (1 << (n - 1)) - 1; return value & ((1 << n) - 1); } /* * Register a new report for a device. */ struct hid_report *hid_register_report(struct hid_device *device, enum hid_report_type type, unsigned int id, unsigned int application) { struct hid_report_enum *report_enum = device->report_enum + type; struct hid_report *report; if (id >= HID_MAX_IDS) return NULL; if (report_enum->report_id_hash[id]) return report_enum->report_id_hash[id]; report = kzalloc(sizeof(struct hid_report), GFP_KERNEL); if (!report) return NULL; if (id != 0) report_enum->numbered = 1; report->id = id; report->type = type; report->size = 0; report->device = device; report->application = application; report_enum->report_id_hash[id] = report; list_add_tail(&report->list, &report_enum->report_list); INIT_LIST_HEAD(&report->field_entry_list); return report; } EXPORT_SYMBOL_GPL(hid_register_report); /* * Register a new field for this report. */ static struct hid_field *hid_register_field(struct hid_report *report, unsigned usages) { struct hid_field *field; if (report->maxfield == HID_MAX_FIELDS) { hid_err(report->device, "too many fields in report\n"); return NULL; } field = kvzalloc((sizeof(struct hid_field) + usages * sizeof(struct hid_usage) + 3 * usages * sizeof(unsigned int)), GFP_KERNEL); if (!field) return NULL; field->index = report->maxfield++; report->field[field->index] = field; field->usage = (struct hid_usage *)(field + 1); field->value = (s32 *)(field->usage + usages); field->new_value = (s32 *)(field->value + usages); field->usages_priorities = (s32 *)(field->new_value + usages); field->report = report; return field; } /* * Open a collection. The type/usage is pushed on the stack. */ static int open_collection(struct hid_parser *parser, unsigned type) { struct hid_collection *collection; unsigned usage; int collection_index; usage = parser->local.usage[0]; if (parser->collection_stack_ptr == parser->collection_stack_size) { unsigned int *collection_stack; unsigned int new_size = parser->collection_stack_size + HID_COLLECTION_STACK_SIZE; collection_stack = krealloc(parser->collection_stack, new_size * sizeof(unsigned int), GFP_KERNEL); if (!collection_stack) return -ENOMEM; parser->collection_stack = collection_stack; parser->collection_stack_size = new_size; } if (parser->device->maxcollection == parser->device->collection_size) { collection = kmalloc( array3_size(sizeof(struct hid_collection), parser->device->collection_size, 2), GFP_KERNEL); if (collection == NULL) { hid_err(parser->device, "failed to reallocate collection array\n"); return -ENOMEM; } memcpy(collection, parser->device->collection, sizeof(struct hid_collection) * parser->device->collection_size); memset(collection + parser->device->collection_size, 0, sizeof(struct hid_collection) * parser->device->collection_size); kfree(parser->device->collection); parser->device->collection = collection; parser->device->collection_size *= 2; } parser->collection_stack[parser->collection_stack_ptr++] = parser->device->maxcollection; collection_index = parser->device->maxcollection++; collection = parser->device->collection + collection_index; collection->type = type; collection->usage = usage; collection->level = parser->collection_stack_ptr - 1; collection->parent_idx = (collection->level == 0) ? -1 : parser->collection_stack[collection->level - 1]; if (type == HID_COLLECTION_APPLICATION) parser->device->maxapplication++; return 0; } /* * Close a collection. */ static int close_collection(struct hid_parser *parser) { if (!parser->collection_stack_ptr) { hid_err(parser->device, "collection stack underflow\n"); return -EINVAL; } parser->collection_stack_ptr--; return 0; } /* * Climb up the stack, search for the specified collection type * and return the usage. */ static unsigned hid_lookup_collection(struct hid_parser *parser, unsigned type) { struct hid_collection *collection = parser->device->collection; int n; for (n = parser->collection_stack_ptr - 1; n >= 0; n--) { unsigned index = parser->collection_stack[n]; if (collection[index].type == type) return collection[index].usage; } return 0; /* we know nothing about this usage type */ } /* * Concatenate usage which defines 16 bits or less with the * currently defined usage page to form a 32 bit usage */ static void complete_usage(struct hid_parser *parser, unsigned int index) { parser->local.usage[index] &= 0xFFFF; parser->local.usage[index] |= (parser->global.usage_page & 0xFFFF) << 16; } /* * Add a usage to the temporary parser table. */ static int hid_add_usage(struct hid_parser *parser, unsigned usage, u8 size) { if (parser->local.usage_index >= HID_MAX_USAGES) { hid_err(parser->device, "usage index exceeded\n"); return -1; } parser->local.usage[parser->local.usage_index] = usage; /* * If Usage item only includes usage id, concatenate it with * currently defined usage page */ if (size <= 2) complete_usage(parser, parser->local.usage_index); parser->local.usage_size[parser->local.usage_index] = size; parser->local.collection_index[parser->local.usage_index] = parser->collection_stack_ptr ? parser->collection_stack[parser->collection_stack_ptr - 1] : 0; parser->local.usage_index++; return 0; } /* * Register a new field for this report. */ static int hid_add_field(struct hid_parser *parser, unsigned report_type, unsigned flags) { struct hid_report *report; struct hid_field *field; unsigned int max_buffer_size = HID_MAX_BUFFER_SIZE; unsigned int usages; unsigned int offset; unsigned int i; unsigned int application; application = hid_lookup_collection(parser, HID_COLLECTION_APPLICATION); report = hid_register_report(parser->device, report_type, parser->global.report_id, application); if (!report) { hid_err(parser->device, "hid_register_report failed\n"); return -1; } /* Handle both signed and unsigned cases properly */ if ((parser->global.logical_minimum < 0 && parser->global.logical_maximum < parser->global.logical_minimum) || (parser->global.logical_minimum >= 0 && (__u32)parser->global.logical_maximum < (__u32)parser->global.logical_minimum)) { dbg_hid("logical range invalid 0x%x 0x%x\n", parser->global.logical_minimum, parser->global.logical_maximum); return -1; } offset = report->size; report->size += parser->global.report_size * parser->global.report_count; if (parser->device->ll_driver->max_buffer_size) max_buffer_size = parser->device->ll_driver->max_buffer_size; /* Total size check: Allow for possible report index byte */ if (report->size > (max_buffer_size - 1) << 3) { hid_err(parser->device, "report is too long\n"); return -1; } if (!parser->local.usage_index) /* Ignore padding fields */ return 0; usages = max_t(unsigned, parser->local.usage_index, parser->global.report_count); field = hid_register_field(report, usages); if (!field) return 0; field->physical = hid_lookup_collection(parser, HID_COLLECTION_PHYSICAL); field->logical = hid_lookup_collection(parser, HID_COLLECTION_LOGICAL); field->application = application; for (i = 0; i < usages; i++) { unsigned j = i; /* Duplicate the last usage we parsed if we have excess values */ if (i >= parser->local.usage_index) j = parser->local.usage_index - 1; field->usage[i].hid = parser->local.usage[j]; field->usage[i].collection_index = parser->local.collection_index[j]; field->usage[i].usage_index = i; field->usage[i].resolution_multiplier = 1; } field->maxusage = usages; field->flags = flags; field->report_offset = offset; field->report_type = report_type; field->report_size = parser->global.report_size; field->report_count = parser->global.report_count; field->logical_minimum = parser->global.logical_minimum; field->logical_maximum = parser->global.logical_maximum; field->physical_minimum = parser->global.physical_minimum; field->physical_maximum = parser->global.physical_maximum; field->unit_exponent = parser->global.unit_exponent; field->unit = parser->global.unit; return 0; } /* * Read data value from item. */ static u32 item_udata(struct hid_item *item) { switch (item->size) { case 1: return item->data.u8; case 2: return item->data.u16; case 4: return item->data.u32; } return 0; } static s32 item_sdata(struct hid_item *item) { switch (item->size) { case 1: return item->data.s8; case 2: return item->data.s16; case 4: return item->data.s32; } return 0; } /* * Process a global item. */ static int hid_parser_global(struct hid_parser *parser, struct hid_item *item) { __s32 raw_value; switch (item->tag) { case HID_GLOBAL_ITEM_TAG_PUSH: if (parser->global_stack_ptr == HID_GLOBAL_STACK_SIZE) { hid_err(parser->device, "global environment stack overflow\n"); return -1; } memcpy(parser->global_stack + parser->global_stack_ptr++, &parser->global, sizeof(struct hid_global)); return 0; case HID_GLOBAL_ITEM_TAG_POP: if (!parser->global_stack_ptr) { hid_err(parser->device, "global environment stack underflow\n"); return -1; } memcpy(&parser->global, parser->global_stack + --parser->global_stack_ptr, sizeof(struct hid_global)); return 0; case HID_GLOBAL_ITEM_TAG_USAGE_PAGE: parser->global.usage_page = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_LOGICAL_MINIMUM: parser->global.logical_minimum = item_sdata(item); return 0; case HID_GLOBAL_ITEM_TAG_LOGICAL_MAXIMUM: if (parser->global.logical_minimum < 0) parser->global.logical_maximum = item_sdata(item); else parser->global.logical_maximum = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_PHYSICAL_MINIMUM: parser->global.physical_minimum = item_sdata(item); return 0; case HID_GLOBAL_ITEM_TAG_PHYSICAL_MAXIMUM: if (parser->global.physical_minimum < 0) parser->global.physical_maximum = item_sdata(item); else parser->global.physical_maximum = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_UNIT_EXPONENT: /* Many devices provide unit exponent as a two's complement * nibble due to the common misunderstanding of HID * specification 1.11, 6.2.2.7 Global Items. Attempt to handle * both this and the standard encoding. */ raw_value = item_sdata(item); if (!(raw_value & 0xfffffff0)) parser->global.unit_exponent = snto32(raw_value, 4); else parser->global.unit_exponent = raw_value; return 0; case HID_GLOBAL_ITEM_TAG_UNIT: parser->global.unit = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_REPORT_SIZE: parser->global.report_size = item_udata(item); if (parser->global.report_size > 256) { hid_err(parser->device, "invalid report_size %d\n", parser->global.report_size); return -1; } return 0; case HID_GLOBAL_ITEM_TAG_REPORT_COUNT: parser->global.report_count = item_udata(item); if (parser->global.report_count > HID_MAX_USAGES) { hid_err(parser->device, "invalid report_count %d\n", parser->global.report_count); return -1; } return 0; case HID_GLOBAL_ITEM_TAG_REPORT_ID: parser->global.report_id = item_udata(item); if (parser->global.report_id == 0 || parser->global.report_id >= HID_MAX_IDS) { hid_err(parser->device, "report_id %u is invalid\n", parser->global.report_id); return -1; } return 0; default: hid_err(parser->device, "unknown global tag 0x%x\n", item->tag); return -1; } } /* * Process a local item. */ static int hid_parser_local(struct hid_parser *parser, struct hid_item *item) { __u32 data; unsigned n; __u32 count; data = item_udata(item); switch (item->tag) { case HID_LOCAL_ITEM_TAG_DELIMITER: if (data) { /* * We treat items before the first delimiter * as global to all usage sets (branch 0). * In the moment we process only these global * items and the first delimiter set. */ if (parser->local.delimiter_depth != 0) { hid_err(parser->device, "nested delimiters\n"); return -1; } parser->local.delimiter_depth++; parser->local.delimiter_branch++; } else { if (parser->local.delimiter_depth < 1) { hid_err(parser->device, "bogus close delimiter\n"); return -1; } parser->local.delimiter_depth--; } return 0; case HID_LOCAL_ITEM_TAG_USAGE: if (parser->local.delimiter_branch > 1) { dbg_hid("alternative usage ignored\n"); return 0; } return hid_add_usage(parser, data, item->size); case HID_LOCAL_ITEM_TAG_USAGE_MINIMUM: if (parser->local.delimiter_branch > 1) { dbg_hid("alternative usage ignored\n"); return 0; } parser->local.usage_minimum = data; return 0; case HID_LOCAL_ITEM_TAG_USAGE_MAXIMUM: if (parser->local.delimiter_branch > 1) { dbg_hid("alternative usage ignored\n"); return 0; } count = data - parser->local.usage_minimum; if (count + parser->local.usage_index >= HID_MAX_USAGES) { /* * We do not warn if the name is not set, we are * actually pre-scanning the device. */ if (dev_name(&parser->device->dev)) hid_warn(parser->device, "ignoring exceeding usage max\n"); data = HID_MAX_USAGES - parser->local.usage_index + parser->local.usage_minimum - 1; if (data <= 0) { hid_err(parser->device, "no more usage index available\n"); return -1; } } for (n = parser->local.usage_minimum; n <= data; n++) if (hid_add_usage(parser, n, item->size)) { dbg_hid("hid_add_usage failed\n"); return -1; } return 0; default: dbg_hid("unknown local item tag 0x%x\n", item->tag); return 0; } return 0; } /* * Concatenate Usage Pages into Usages where relevant: * As per specification, 6.2.2.8: "When the parser encounters a main item it * concatenates the last declared Usage Page with a Usage to form a complete * usage value." */ static void hid_concatenate_last_usage_page(struct hid_parser *parser) { int i; unsigned int usage_page; unsigned int current_page; if (!parser->local.usage_index) return; usage_page = parser->global.usage_page; /* * Concatenate usage page again only if last declared Usage Page * has not been already used in previous usages concatenation */ for (i = parser->local.usage_index - 1; i >= 0; i--) { if (parser->local.usage_size[i] > 2) /* Ignore extended usages */ continue; current_page = parser->local.usage[i] >> 16; if (current_page == usage_page) break; complete_usage(parser, i); } } /* * Process a main item. */ static int hid_parser_main(struct hid_parser *parser, struct hid_item *item) { __u32 data; int ret; hid_concatenate_last_usage_page(parser); data = item_udata(item); switch (item->tag) { case HID_MAIN_ITEM_TAG_BEGIN_COLLECTION: ret = open_collection(parser, data & 0xff); break; case HID_MAIN_ITEM_TAG_END_COLLECTION: ret = close_collection(parser); break; case HID_MAIN_ITEM_TAG_INPUT: ret = hid_add_field(parser, HID_INPUT_REPORT, data); break; case HID_MAIN_ITEM_TAG_OUTPUT: ret = hid_add_field(parser, HID_OUTPUT_REPORT, data); break; case HID_MAIN_ITEM_TAG_FEATURE: ret = hid_add_field(parser, HID_FEATURE_REPORT, data); break; default: hid_warn(parser->device, "unknown main item tag 0x%x\n", item->tag); ret = 0; } memset(&parser->local, 0, sizeof(parser->local)); /* Reset the local parser environment */ return ret; } /* * Process a reserved item. */ static int hid_parser_reserved(struct hid_parser *parser, struct hid_item *item) { dbg_hid("reserved item type, tag 0x%x\n", item->tag); return 0; } /* * Free a report and all registered fields. The field->usage and * field->value table's are allocated behind the field, so we need * only to free(field) itself. */ static void hid_free_report(struct hid_report *report) { unsigned n; kfree(report->field_entries); for (n = 0; n < report->maxfield; n++) kvfree(report->field[n]); kfree(report); } /* * Close report. This function returns the device * state to the point prior to hid_open_report(). */ static void hid_close_report(struct hid_device *device) { unsigned i, j; for (i = 0; i < HID_REPORT_TYPES; i++) { struct hid_report_enum *report_enum = device->report_enum + i; for (j = 0; j < HID_MAX_IDS; j++) { struct hid_report *report = report_enum->report_id_hash[j]; if (report) hid_free_report(report); } memset(report_enum, 0, sizeof(*report_enum)); INIT_LIST_HEAD(&report_enum->report_list); } /* * If the HID driver had a rdesc_fixup() callback, dev->rdesc * will be allocated by hid-core and needs to be freed. * Otherwise, it is either equal to dev_rdesc or bpf_rdesc, in * which cases it'll be freed later on device removal or destroy. */ if (device->rdesc != device->dev_rdesc && device->rdesc != device->bpf_rdesc) kfree(device->rdesc); device->rdesc = NULL; device->rsize = 0; kfree(device->collection); device->collection = NULL; device->collection_size = 0; device->maxcollection = 0; device->maxapplication = 0; device->status &= ~HID_STAT_PARSED; } static inline void hid_free_bpf_rdesc(struct hid_device *hdev) { /* bpf_rdesc is either equal to dev_rdesc or allocated by call_hid_bpf_rdesc_fixup() */ if (hdev->bpf_rdesc != hdev->dev_rdesc) kfree(hdev->bpf_rdesc); hdev->bpf_rdesc = NULL; } /* * Free a device structure, all reports, and all fields. */ void hiddev_free(struct kref *ref) { struct hid_device *hid = container_of(ref, struct hid_device, ref); hid_close_report(hid); hid_free_bpf_rdesc(hid); kfree(hid->dev_rdesc); kfree(hid); } static void hid_device_release(struct device *dev) { struct hid_device *hid = to_hid_device(dev); kref_put(&hid->ref, hiddev_free); } /* * Fetch a report description item from the data stream. We support long * items, though they are not used yet. */ static const u8 *fetch_item(const __u8 *start, const __u8 *end, struct hid_item *item) { u8 b; if ((end - start) <= 0) return NULL; b = *start++; item->type = (b >> 2) & 3; item->tag = (b >> 4) & 15; if (item->tag == HID_ITEM_TAG_LONG) { item->format = HID_ITEM_FORMAT_LONG; if ((end - start) < 2) return NULL; item->size = *start++; item->tag = *start++; if ((end - start) < item->size) return NULL; item->data.longdata = start; start += item->size; return start; } item->format = HID_ITEM_FORMAT_SHORT; item->size = BIT(b & 3) >> 1; /* 0, 1, 2, 3 -> 0, 1, 2, 4 */ if (end - start < item->size) return NULL; switch (item->size) { case 0: break; case 1: item->data.u8 = *start; break; case 2: item->data.u16 = get_unaligned_le16(start); break; case 4: item->data.u32 = get_unaligned_le32(start); break; } return start + item->size; } static void hid_scan_input_usage(struct hid_parser *parser, u32 usage) { struct hid_device *hid = parser->device; if (usage == HID_DG_CONTACTID) hid->group = HID_GROUP_MULTITOUCH; } static void hid_scan_feature_usage(struct hid_parser *parser, u32 usage) { if (usage == 0xff0000c5 && parser->global.report_count == 256 && parser->global.report_size == 8) parser->scan_flags |= HID_SCAN_FLAG_MT_WIN_8; if (usage == 0xff0000c6 && parser->global.report_count == 1 && parser->global.report_size == 8) parser->scan_flags |= HID_SCAN_FLAG_MT_WIN_8; } static void hid_scan_collection(struct hid_parser *parser, unsigned type) { struct hid_device *hid = parser->device; int i; if (((parser->global.usage_page << 16) == HID_UP_SENSOR) && (type == HID_COLLECTION_PHYSICAL || type == HID_COLLECTION_APPLICATION)) hid->group = HID_GROUP_SENSOR_HUB; if (hid->vendor == USB_VENDOR_ID_MICROSOFT && hid->product == USB_DEVICE_ID_MS_POWER_COVER && hid->group == HID_GROUP_MULTITOUCH) hid->group = HID_GROUP_GENERIC; if ((parser->global.usage_page << 16) == HID_UP_GENDESK) for (i = 0; i < parser->local.usage_index; i++) if (parser->local.usage[i] == HID_GD_POINTER) parser->scan_flags |= HID_SCAN_FLAG_GD_POINTER; if ((parser->global.usage_page << 16) >= HID_UP_MSVENDOR) parser->scan_flags |= HID_SCAN_FLAG_VENDOR_SPECIFIC; if ((parser->global.usage_page << 16) == HID_UP_GOOGLEVENDOR) for (i = 0; i < parser->local.usage_index; i++) if (parser->local.usage[i] == (HID_UP_GOOGLEVENDOR | 0x0001)) parser->device->group = HID_GROUP_VIVALDI; } static int hid_scan_main(struct hid_parser *parser, struct hid_item *item) { __u32 data; int i; hid_concatenate_last_usage_page(parser); data = item_udata(item); switch (item->tag) { case HID_MAIN_ITEM_TAG_BEGIN_COLLECTION: hid_scan_collection(parser, data & 0xff); break; case HID_MAIN_ITEM_TAG_END_COLLECTION: break; case HID_MAIN_ITEM_TAG_INPUT: /* ignore constant inputs, they will be ignored by hid-input */ if (data & HID_MAIN_ITEM_CONSTANT) break; for (i = 0; i < parser->local.usage_index; i++) hid_scan_input_usage(parser, parser->local.usage[i]); break; case HID_MAIN_ITEM_TAG_OUTPUT: break; case HID_MAIN_ITEM_TAG_FEATURE: for (i = 0; i < parser->local.usage_index; i++) hid_scan_feature_usage(parser, parser->local.usage[i]); break; } /* Reset the local parser environment */ memset(&parser->local, 0, sizeof(parser->local)); return 0; } /* * Scan a report descriptor before the device is added to the bus. * Sets device groups and other properties that determine what driver * to load. */ static int hid_scan_report(struct hid_device *hid) { struct hid_parser *parser; struct hid_item item; const __u8 *start = hid->dev_rdesc; const __u8 *end = start + hid->dev_rsize; static int (*dispatch_type[])(struct hid_parser *parser, struct hid_item *item) = { hid_scan_main, hid_parser_global, hid_parser_local, hid_parser_reserved }; parser = vzalloc(sizeof(struct hid_parser)); if (!parser) return -ENOMEM; parser->device = hid; hid->group = HID_GROUP_GENERIC; /* * The parsing is simpler than the one in hid_open_report() as we should * be robust against hid errors. Those errors will be raised by * hid_open_report() anyway. */ while ((start = fetch_item(start, end, &item)) != NULL) dispatch_type[item.type](parser, &item); /* * Handle special flags set during scanning. */ if ((parser->scan_flags & HID_SCAN_FLAG_MT_WIN_8) && (hid->group == HID_GROUP_MULTITOUCH)) hid->group = HID_GROUP_MULTITOUCH_WIN_8; /* * Vendor specific handlings */ switch (hid->vendor) { case USB_VENDOR_ID_WACOM: hid->group = HID_GROUP_WACOM; break; case USB_VENDOR_ID_SYNAPTICS: if (hid->group == HID_GROUP_GENERIC) if ((parser->scan_flags & HID_SCAN_FLAG_VENDOR_SPECIFIC) && (parser->scan_flags & HID_SCAN_FLAG_GD_POINTER)) /* * hid-rmi should take care of them, * not hid-generic */ hid->group = HID_GROUP_RMI; break; } kfree(parser->collection_stack); vfree(parser); return 0; } /** * hid_parse_report - parse device report * * @hid: hid device * @start: report start * @size: report size * * Allocate the device report as read by the bus driver. This function should * only be called from parse() in ll drivers. */ int hid_parse_report(struct hid_device *hid, const __u8 *start, unsigned size) { hid->dev_rdesc = kmemdup(start, size, GFP_KERNEL); if (!hid->dev_rdesc) return -ENOMEM; hid->dev_rsize = size; return 0; } EXPORT_SYMBOL_GPL(hid_parse_report); static const char * const hid_report_names[] = { "HID_INPUT_REPORT", "HID_OUTPUT_REPORT", "HID_FEATURE_REPORT", }; /** * hid_validate_values - validate existing device report's value indexes * * @hid: hid device * @type: which report type to examine * @id: which report ID to examine (0 for first) * @field_index: which report field to examine * @report_counts: expected number of values * * Validate the number of values in a given field of a given report, after * parsing. */ struct hid_report *hid_validate_values(struct hid_device *hid, enum hid_report_type type, unsigned int id, unsigned int field_index, unsigned int report_counts) { struct hid_report *report; if (type > HID_FEATURE_REPORT) { hid_err(hid, "invalid HID report type %u\n", type); return NULL; } if (id >= HID_MAX_IDS) { hid_err(hid, "invalid HID report id %u\n", id); return NULL; } /* * Explicitly not using hid_get_report() here since it depends on * ->numbered being checked, which may not always be the case when * drivers go to access report values. */ if (id == 0) { /* * Validating on id 0 means we should examine the first * report in the list. */ report = list_first_entry_or_null( &hid->report_enum[type].report_list, struct hid_report, list); } else { report = hid->report_enum[type].report_id_hash[id]; } if (!report) { hid_err(hid, "missing %s %u\n", hid_report_names[type], id); return NULL; } if (report->maxfield <= field_index) { hid_err(hid, "not enough fields in %s %u\n", hid_report_names[type], id); return NULL; } if (report->field[field_index]->report_count < report_counts) { hid_err(hid, "not enough values in %s %u field %u\n", hid_report_names[type], id, field_index); return NULL; } return report; } EXPORT_SYMBOL_GPL(hid_validate_values); static int hid_calculate_multiplier(struct hid_device *hid, struct hid_field *multiplier) { int m; __s32 v = *multiplier->value; __s32 lmin = multiplier->logical_minimum; __s32 lmax = multiplier->logical_maximum; __s32 pmin = multiplier->physical_minimum; __s32 pmax = multiplier->physical_maximum; /* * "Because OS implementations will generally divide the control's * reported count by the Effective Resolution Multiplier, designers * should take care not to establish a potential Effective * Resolution Multiplier of zero." * HID Usage Table, v1.12, Section 4.3.1, p31 */ if (lmax - lmin == 0) return 1; /* * Handling the unit exponent is left as an exercise to whoever * finds a device where that exponent is not 0. */ m = ((v - lmin)/(lmax - lmin) * (pmax - pmin) + pmin); if (unlikely(multiplier->unit_exponent != 0)) { hid_warn(hid, "unsupported Resolution Multiplier unit exponent %d\n", multiplier->unit_exponent); } /* There are no devices with an effective multiplier > 255 */ if (unlikely(m == 0 || m > 255 || m < -255)) { hid_warn(hid, "unsupported Resolution Multiplier %d\n", m); m = 1; } return m; } static void hid_apply_multiplier_to_field(struct hid_device *hid, struct hid_field *field, struct hid_collection *multiplier_collection, int effective_multiplier) { struct hid_collection *collection; struct hid_usage *usage; int i; /* * If multiplier_collection is NULL, the multiplier applies * to all fields in the report. * Otherwise, it is the Logical Collection the multiplier applies to * but our field may be in a subcollection of that collection. */ for (i = 0; i < field->maxusage; i++) { usage = &field->usage[i]; collection = &hid->collection[usage->collection_index]; while (collection->parent_idx != -1 && collection != multiplier_collection) collection = &hid->collection[collection->parent_idx]; if (collection->parent_idx != -1 || multiplier_collection == NULL) usage->resolution_multiplier = effective_multiplier; } } static void hid_apply_multiplier(struct hid_device *hid, struct hid_field *multiplier) { struct hid_report_enum *rep_enum; struct hid_report *rep; struct hid_field *field; struct hid_collection *multiplier_collection; int effective_multiplier; int i; /* * "The Resolution Multiplier control must be contained in the same * Logical Collection as the control(s) to which it is to be applied. * If no Resolution Multiplier is defined, then the Resolution * Multiplier defaults to 1. If more than one control exists in a * Logical Collection, the Resolution Multiplier is associated with * all controls in the collection. If no Logical Collection is * defined, the Resolution Multiplier is associated with all * controls in the report." * HID Usage Table, v1.12, Section 4.3.1, p30 * * Thus, search from the current collection upwards until we find a * logical collection. Then search all fields for that same parent * collection. Those are the fields the multiplier applies to. * * If we have more than one multiplier, it will overwrite the * applicable fields later. */ multiplier_collection = &hid->collection[multiplier->usage->collection_index]; while (multiplier_collection->parent_idx != -1 && multiplier_collection->type != HID_COLLECTION_LOGICAL) multiplier_collection = &hid->collection[multiplier_collection->parent_idx]; if (multiplier_collection->type != HID_COLLECTION_LOGICAL) multiplier_collection = NULL; effective_multiplier = hid_calculate_multiplier(hid, multiplier); rep_enum = &hid->report_enum[HID_INPUT_REPORT]; list_for_each_entry(rep, &rep_enum->report_list, list) { for (i = 0; i < rep->maxfield; i++) { field = rep->field[i]; hid_apply_multiplier_to_field(hid, field, multiplier_collection, effective_multiplier); } } } /* * hid_setup_resolution_multiplier - set up all resolution multipliers * * @device: hid device * * Search for all Resolution Multiplier Feature Reports and apply their * value to all matching Input items. This only updates the internal struct * fields. * * The Resolution Multiplier is applied by the hardware. If the multiplier * is anything other than 1, the hardware will send pre-multiplied events * so that the same physical interaction generates an accumulated * accumulated_value = value * * multiplier * This may be achieved by sending * - "value * multiplier" for each event, or * - "value" but "multiplier" times as frequently, or * - a combination of the above * The only guarantee is that the same physical interaction always generates * an accumulated 'value * multiplier'. * * This function must be called before any event processing and after * any SetRequest to the Resolution Multiplier. */ void hid_setup_resolution_multiplier(struct hid_device *hid) { struct hid_report_enum *rep_enum; struct hid_report *rep; struct hid_usage *usage; int i, j; rep_enum = &hid->report_enum[HID_FEATURE_REPORT]; list_for_each_entry(rep, &rep_enum->report_list, list) { for (i = 0; i < rep->maxfield; i++) { /* Ignore if report count is out of bounds. */ if (rep->field[i]->report_count < 1) continue; for (j = 0; j < rep->field[i]->maxusage; j++) { usage = &rep->field[i]->usage[j]; if (usage->hid == HID_GD_RESOLUTION_MULTIPLIER) hid_apply_multiplier(hid, rep->field[i]); } } } } EXPORT_SYMBOL_GPL(hid_setup_resolution_multiplier); /** * hid_open_report - open a driver-specific device report * * @device: hid device * * Parse a report description into a hid_device structure. Reports are * enumerated, fields are attached to these reports. * 0 returned on success, otherwise nonzero error value. * * This function (or the equivalent hid_parse() macro) should only be * called from probe() in drivers, before starting the device. */ int hid_open_report(struct hid_device *device) { struct hid_parser *parser; struct hid_item item; unsigned int size; const __u8 *start; const __u8 *end; const __u8 *next; int ret; int i; static int (*dispatch_type[])(struct hid_parser *parser, struct hid_item *item) = { hid_parser_main, hid_parser_global, hid_parser_local, hid_parser_reserved }; if (WARN_ON(device->status & HID_STAT_PARSED)) return -EBUSY; start = device->bpf_rdesc; if (WARN_ON(!start)) return -ENODEV; size = device->bpf_rsize; if (device->driver->report_fixup) { /* * device->driver->report_fixup() needs to work * on a copy of our report descriptor so it can * change it. */ __u8 *buf = kmemdup(start, size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; start = device->driver->report_fixup(device, buf, &size); /* * The second kmemdup is required in case report_fixup() returns * a static read-only memory, but we have no idea if that memory * needs to be cleaned up or not at the end. */ start = kmemdup(start, size, GFP_KERNEL); kfree(buf); if (start == NULL) return -ENOMEM; } device->rdesc = start; device->rsize = size; parser = vzalloc(sizeof(struct hid_parser)); if (!parser) { ret = -ENOMEM; goto alloc_err; } parser->device = device; end = start + size; device->collection = kcalloc(HID_DEFAULT_NUM_COLLECTIONS, sizeof(struct hid_collection), GFP_KERNEL); if (!device->collection) { ret = -ENOMEM; goto err; } device->collection_size = HID_DEFAULT_NUM_COLLECTIONS; for (i = 0; i < HID_DEFAULT_NUM_COLLECTIONS; i++) device->collection[i].parent_idx = -1; ret = -EINVAL; while ((next = fetch_item(start, end, &item)) != NULL) { start = next; if (item.format != HID_ITEM_FORMAT_SHORT) { hid_err(device, "unexpected long global item\n"); goto err; } if (dispatch_type[item.type](parser, &item)) { hid_err(device, "item %u %u %u %u parsing failed\n", item.format, (unsigned)item.size, (unsigned)item.type, (unsigned)item.tag); goto err; } if (start == end) { if (parser->collection_stack_ptr) { hid_err(device, "unbalanced collection at end of report description\n"); goto err; } if (parser->local.delimiter_depth) { hid_err(device, "unbalanced delimiter at end of report description\n"); goto err; } /* * fetch initial values in case the device's * default multiplier isn't the recommended 1 */ hid_setup_resolution_multiplier(device); kfree(parser->collection_stack); vfree(parser); device->status |= HID_STAT_PARSED; return 0; } } hid_err(device, "item fetching failed at offset %u/%u\n", size - (unsigned int)(end - start), size); err: kfree(parser->collection_stack); alloc_err: vfree(parser); hid_close_report(device); return ret; } EXPORT_SYMBOL_GPL(hid_open_report); /* * Extract/implement a data field from/to a little endian report (bit array). * * Code sort-of follows HID spec: * http://www.usb.org/developers/hidpage/HID1_11.pdf * * While the USB HID spec allows unlimited length bit fields in "report * descriptors", most devices never use more than 16 bits. * One model of UPS is claimed to report "LINEV" as a 32-bit field. * Search linux-kernel and linux-usb-devel archives for "hid-core extract". */ static u32 __extract(u8 *report, unsigned offset, int n) { unsigned int idx = offset / 8; unsigned int bit_nr = 0; unsigned int bit_shift = offset % 8; int bits_to_copy = 8 - bit_shift; u32 value = 0; u32 mask = n < 32 ? (1U << n) - 1 : ~0U; while (n > 0) { value |= ((u32)report[idx] >> bit_shift) << bit_nr; n -= bits_to_copy; bit_nr += bits_to_copy; bits_to_copy = 8; bit_shift = 0; idx++; } return value & mask; } u32 hid_field_extract(const struct hid_device *hid, u8 *report, unsigned offset, unsigned n) { if (n > 32) { hid_warn_once(hid, "%s() called with n (%d) > 32! (%s)\n", __func__, n, current->comm); n = 32; } return __extract(report, offset, n); } EXPORT_SYMBOL_GPL(hid_field_extract); /* * "implement" : set bits in a little endian bit stream. * Same concepts as "extract" (see comments above). * The data mangled in the bit stream remains in little endian * order the whole time. It make more sense to talk about * endianness of register values by considering a register * a "cached" copy of the little endian bit stream. */ static void __implement(u8 *report, unsigned offset, int n, u32 value) { unsigned int idx = offset / 8; unsigned int bit_shift = offset % 8; int bits_to_set = 8 - bit_shift; while (n - bits_to_set >= 0) { report[idx] &= ~(0xff << bit_shift); report[idx] |= value << bit_shift; value >>= bits_to_set; n -= bits_to_set; bits_to_set = 8; bit_shift = 0; idx++; } /* last nibble */ if (n) { u8 bit_mask = ((1U << n) - 1); report[idx] &= ~(bit_mask << bit_shift); report[idx] |= value << bit_shift; } } static void implement(const struct hid_device *hid, u8 *report, unsigned offset, unsigned n, u32 value) { if (unlikely(n > 32)) { hid_warn(hid, "%s() called with n (%d) > 32! (%s)\n", __func__, n, current->comm); n = 32; } else if (n < 32) { u32 m = (1U << n) - 1; if (unlikely(value > m)) { hid_warn(hid, "%s() called with too large value %d (n: %d)! (%s)\n", __func__, value, n, current->comm); value &= m; } } __implement(report, offset, n, value); } /* * Search an array for a value. */ static int search(__s32 *array, __s32 value, unsigned n) { while (n--) { if (*array++ == value) return 0; } return -1; } /** * hid_match_report - check if driver's raw_event should be called * * @hid: hid device * @report: hid report to match against * * compare hid->driver->report_table->report_type to report->type */ static int hid_match_report(struct hid_device *hid, struct hid_report *report) { const struct hid_report_id *id = hid->driver->report_table; if (!id) /* NULL means all */ return 1; for (; id->report_type != HID_TERMINATOR; id++) if (id->report_type == HID_ANY_ID || id->report_type == report->type) return 1; return 0; } /** * hid_match_usage - check if driver's event should be called * * @hid: hid device * @usage: usage to match against * * compare hid->driver->usage_table->usage_{type,code} to * usage->usage_{type,code} */ static int hid_match_usage(struct hid_device *hid, struct hid_usage *usage) { const struct hid_usage_id *id = hid->driver->usage_table; if (!id) /* NULL means all */ return 1; for (; id->usage_type != HID_ANY_ID - 1; id++) if ((id->usage_hid == HID_ANY_ID || id->usage_hid == usage->hid) && (id->usage_type == HID_ANY_ID || id->usage_type == usage->type) && (id->usage_code == HID_ANY_ID || id->usage_code == usage->code)) return 1; return 0; } static void hid_process_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value, int interrupt) { struct hid_driver *hdrv = hid->driver; int ret; if (!list_empty(&hid->debug_list)) hid_dump_input(hid, usage, value); if (hdrv && hdrv->event && hid_match_usage(hid, usage)) { ret = hdrv->event(hid, field, usage, value); if (ret != 0) { if (ret < 0) hid_err(hid, "%s's event failed with %d\n", hdrv->name, ret); return; } } if (hid->claimed & HID_CLAIMED_INPUT) hidinput_hid_event(hid, field, usage, value); if (hid->claimed & HID_CLAIMED_HIDDEV && interrupt && hid->hiddev_hid_event) hid->hiddev_hid_event(hid, field, usage, value); } /* * Checks if the given value is valid within this field */ static inline int hid_array_value_is_valid(struct hid_field *field, __s32 value) { __s32 min = field->logical_minimum; /* * Value needs to be between logical min and max, and * (value - min) is used as an index in the usage array. * This array is of size field->maxusage */ return value >= min && value <= field->logical_maximum && value - min < field->maxusage; } /* * Fetch the field from the data. The field content is stored for next * report processing (we do differential reporting to the layer). */ static void hid_input_fetch_field(struct hid_device *hid, struct hid_field *field, __u8 *data) { unsigned n; unsigned count = field->report_count; unsigned offset = field->report_offset; unsigned size = field->report_size; __s32 min = field->logical_minimum; __s32 *value; value = field->new_value; memset(value, 0, count * sizeof(__s32)); field->ignored = false; for (n = 0; n < count; n++) { value[n] = min < 0 ? snto32(hid_field_extract(hid, data, offset + n * size, size), size) : hid_field_extract(hid, data, offset + n * size, size); /* Ignore report if ErrorRollOver */ if (!(field->flags & HID_MAIN_ITEM_VARIABLE) && hid_array_value_is_valid(field, value[n]) && field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) { field->ignored = true; return; } } } /* * Process a received variable field. */ static void hid_input_var_field(struct hid_device *hid, struct hid_field *field, int interrupt) { unsigned int count = field->report_count; __s32 *value = field->new_value; unsigned int n; for (n = 0; n < count; n++) hid_process_event(hid, field, &field->usage[n], value[n], interrupt); memcpy(field->value, value, count * sizeof(__s32)); } /* * Process a received array field. The field content is stored for * next report processing (we do differential reporting to the layer). */ static void hid_input_array_field(struct hid_device *hid, struct hid_field *field, int interrupt) { unsigned int n; unsigned int count = field->report_count; __s32 min = field->logical_minimum; __s32 *value; value = field->new_value; /* ErrorRollOver */ if (field->ignored) return; for (n = 0; n < count; n++) { if (hid_array_value_is_valid(field, field->value[n]) && search(value, field->value[n], count)) hid_process_event(hid, field, &field->usage[field->value[n] - min], 0, interrupt); if (hid_array_value_is_valid(field, value[n]) && search(field->value, value[n], count)) hid_process_event(hid, field, &field->usage[value[n] - min], 1, interrupt); } memcpy(field->value, value, count * sizeof(__s32)); } /* * Analyse a received report, and fetch the data from it. The field * content is stored for next report processing (we do differential * reporting to the layer). */ static void hid_process_report(struct hid_device *hid, struct hid_report *report, __u8 *data, int interrupt) { unsigned int a; struct hid_field_entry *entry; struct hid_field *field; /* first retrieve all incoming values in data */ for (a = 0; a < report->maxfield; a++) hid_input_fetch_field(hid, report->field[a], data); if (!list_empty(&report->field_entry_list)) { /* INPUT_REPORT, we have a priority list of fields */ list_for_each_entry(entry, &report->field_entry_list, list) { field = entry->field; if (field->flags & HID_MAIN_ITEM_VARIABLE) hid_process_event(hid, field, &field->usage[entry->index], field->new_value[entry->index], interrupt); else hid_input_array_field(hid, field, interrupt); } /* we need to do the memcpy at the end for var items */ for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) memcpy(field->value, field->new_value, field->report_count * sizeof(__s32)); } } else { /* FEATURE_REPORT, regular processing */ for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) hid_input_var_field(hid, field, interrupt); else hid_input_array_field(hid, field, interrupt); } } } /* * Insert a given usage_index in a field in the list * of processed usages in the report. * * The elements of lower priority score are processed * first. */ static void __hid_insert_field_entry(struct hid_device *hid, struct hid_report *report, struct hid_field_entry *entry, struct hid_field *field, unsigned int usage_index) { struct hid_field_entry *next; entry->field = field; entry->index = usage_index; entry->priority = field->usages_priorities[usage_index]; /* insert the element at the correct position */ list_for_each_entry(next, &report->field_entry_list, list) { /* * the priority of our element is strictly higher * than the next one, insert it before */ if (entry->priority > next->priority) { list_add_tail(&entry->list, &next->list); return; } } /* lowest priority score: insert at the end */ list_add_tail(&entry->list, &report->field_entry_list); } static void hid_report_process_ordering(struct hid_device *hid, struct hid_report *report) { struct hid_field *field; struct hid_field_entry *entries; unsigned int a, u, usages; unsigned int count = 0; /* count the number of individual fields in the report */ for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) count += field->report_count; else count++; } /* allocate the memory to process the fields */ entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); if (!entries) return; report->field_entries = entries; /* * walk through all fields in the report and * store them by priority order in report->field_entry_list * * - Var elements are individualized (field + usage_index) * - Arrays are taken as one, we can not chose an order for them */ usages = 0; for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) { for (u = 0; u < field->report_count; u++) { __hid_insert_field_entry(hid, report, &entries[usages], field, u); usages++; } } else { __hid_insert_field_entry(hid, report, &entries[usages], field, 0); usages++; } } } static void hid_process_ordering(struct hid_device *hid) { struct hid_report *report; struct hid_report_enum *report_enum = &hid->report_enum[HID_INPUT_REPORT]; list_for_each_entry(report, &report_enum->report_list, list) hid_report_process_ordering(hid, report); } /* * Output the field into the report. */ static void hid_output_field(const struct hid_device *hid, struct hid_field *field, __u8 *data) { unsigned count = field->report_count; unsigned offset = field->report_offset; unsigned size = field->report_size; unsigned n; for (n = 0; n < count; n++) { if (field->logical_minimum < 0) /* signed values */ implement(hid, data, offset + n * size, size, s32ton(field->value[n], size)); else /* unsigned values */ implement(hid, data, offset + n * size, size, field->value[n]); } } /* * Compute the size of a report. */ static size_t hid_compute_report_size(struct hid_report *report) { if (report->size) return ((report->size - 1) >> 3) + 1; return 0; } /* * Create a report. 'data' has to be allocated using * hid_alloc_report_buf() so that it has proper size. */ void hid_output_report(struct hid_report *report, __u8 *data) { unsigned n; if (report->id > 0) *data++ = report->id; memset(data, 0, hid_compute_report_size(report)); for (n = 0; n < report->maxfield; n++) hid_output_field(report->device, report->field[n], data); } EXPORT_SYMBOL_GPL(hid_output_report); /* * Allocator for buffer that is going to be passed to hid_output_report() */ u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags) { /* * 7 extra bytes are necessary to achieve proper functionality * of implement() working on 8 byte chunks */ u32 len = hid_report_len(report) + 7; return kzalloc(len, flags); } EXPORT_SYMBOL_GPL(hid_alloc_report_buf); /* * Set a field value. The report this field belongs to has to be * created and transferred to the device, to set this value in the * device. */ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value) { unsigned size; if (!field) return -1; size = field->report_size; hid_dump_input(field->report->device, field->usage + offset, value); if (offset >= field->report_count) { hid_err(field->report->device, "offset (%d) exceeds report_count (%d)\n", offset, field->report_count); return -1; } if (field->logical_minimum < 0) { if (value != snto32(s32ton(value, size), size)) { hid_err(field->report->device, "value %d is out of range\n", value); return -1; } } field->value[offset] = value; return 0; } EXPORT_SYMBOL_GPL(hid_set_field); struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type, unsigned int application, unsigned int usage) { struct list_head *report_list = &hdev->report_enum[report_type].report_list; struct hid_report *report; int i, j; list_for_each_entry(report, report_list, list) { if (report->application != application) continue; for (i = 0; i < report->maxfield; i++) { struct hid_field *field = report->field[i]; for (j = 0; j < field->maxusage; j++) { if (field->usage[j].hid == usage) return field; } } } return NULL; } EXPORT_SYMBOL_GPL(hid_find_field); static struct hid_report *hid_get_report(struct hid_report_enum *report_enum, const u8 *data) { struct hid_report *report; unsigned int n = 0; /* Normally report number is 0 */ /* Device uses numbered reports, data[0] is report number */ if (report_enum->numbered) n = *data; report = report_enum->report_id_hash[n]; if (report == NULL) dbg_hid("undefined report_id %u received\n", n); return report; } /* * Implement a generic .request() callback, using .raw_request() * DO NOT USE in hid drivers directly, but through hid_hw_request instead. */ int __hid_request(struct hid_device *hid, struct hid_report *report, enum hid_class_request reqtype) { char *buf; int ret; u32 len; buf = hid_alloc_report_buf(report, GFP_KERNEL); if (!buf) return -ENOMEM; len = hid_report_len(report); if (reqtype == HID_REQ_SET_REPORT) hid_output_report(report, buf); ret = hid->ll_driver->raw_request(hid, report->id, buf, len, report->type, reqtype); if (ret < 0) { dbg_hid("unable to complete request: %d\n", ret); goto out; } if (reqtype == HID_REQ_GET_REPORT) hid_input_report(hid, report->type, buf, ret, 0); ret = 0; out: kfree(buf); return ret; } EXPORT_SYMBOL_GPL(__hid_request); int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) { struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_report *report; struct hid_driver *hdrv; int max_buffer_size = HID_MAX_BUFFER_SIZE; u32 rsize, csize = size; u8 *cdata = data; int ret = 0; report = hid_get_report(report_enum, data); if (!report) goto out; if (report_enum->numbered) { cdata++; csize--; } rsize = hid_compute_report_size(report); if (hid->ll_driver->max_buffer_size) max_buffer_size = hid->ll_driver->max_buffer_size; if (report_enum->numbered && rsize >= max_buffer_size) rsize = max_buffer_size - 1; else if (rsize > max_buffer_size) rsize = max_buffer_size; if (csize < rsize) { dbg_hid("report %d is too short, (%d < %d)\n", report->id, csize, rsize); memset(cdata + csize, 0, rsize - csize); } if ((hid->claimed & HID_CLAIMED_HIDDEV) && hid->hiddev_report_event) hid->hiddev_report_event(hid, report); if (hid->claimed & HID_CLAIMED_HIDRAW) { ret = hidraw_report_event(hid, data, size); if (ret) goto out; } if (hid->claimed != HID_CLAIMED_HIDRAW && report->maxfield) { hid_process_report(hid, report, cdata, interrupt); hdrv = hid->driver; if (hdrv && hdrv->report) hdrv->report(hid, report); } if (hid->claimed & HID_CLAIMED_INPUT) hidinput_report_event(hid, report); out: return ret; } EXPORT_SYMBOL_GPL(hid_report_raw_event); static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, bool lock_already_taken) { struct hid_report_enum *report_enum; struct hid_driver *hdrv; struct hid_report *report; int ret = 0; if (!hid) return -ENODEV; ret = down_trylock(&hid->driver_input_lock); if (lock_already_taken && !ret) { up(&hid->driver_input_lock); return -EINVAL; } else if (!lock_already_taken && ret) { return -EBUSY; } if (!hid->driver) { ret = -ENODEV; goto unlock; } report_enum = hid->report_enum + type; hdrv = hid->driver; data = dispatch_hid_bpf_device_event(hid, type, data, &size, interrupt, source, from_bpf); if (IS_ERR(data)) { ret = PTR_ERR(data); goto unlock; } if (!size) { dbg_hid("empty report\n"); ret = -1; goto unlock; } /* Avoid unnecessary overhead if debugfs is disabled */ if (!list_empty(&hid->debug_list)) hid_dump_report(hid, type, data, size); report = hid_get_report(report_enum, data); if (!report) { ret = -1; goto unlock; } if (hdrv && hdrv->raw_event && hid_match_report(hid, report)) { ret = hdrv->raw_event(hid, report, data, size); if (ret < 0) goto unlock; } ret = hid_report_raw_event(hid, type, data, size, interrupt); unlock: if (!lock_already_taken) up(&hid->driver_input_lock); return ret; } /** * hid_input_report - report data from lower layer (usb, bt...) * * @hid: hid device * @type: HID report type (HID_*_REPORT) * @data: report contents * @size: size of data parameter * @interrupt: distinguish between interrupt and control transfers * * This is data entry for lower layers. */ int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) { return __hid_input_report(hid, type, data, size, interrupt, 0, false, /* from_bpf */ false /* lock_already_taken */); } EXPORT_SYMBOL_GPL(hid_input_report); bool hid_match_one_id(const struct hid_device *hdev, const struct hid_device_id *id) { return (id->bus == HID_BUS_ANY || id->bus == hdev->bus) && (id->group == HID_GROUP_ANY || id->group == hdev->group) && (id->vendor == HID_ANY_ID || id->vendor == hdev->vendor) && (id->product == HID_ANY_ID || id->product == hdev->product); } const struct hid_device_id *hid_match_id(const struct hid_device *hdev, const struct hid_device_id *id) { for (; id->bus; id++) if (hid_match_one_id(hdev, id)) return id; return NULL; } EXPORT_SYMBOL_GPL(hid_match_id); static const struct hid_device_id hid_hiddev_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_MGE, USB_DEVICE_ID_MGE_UPS) }, { HID_USB_DEVICE(USB_VENDOR_ID_MGE, USB_DEVICE_ID_MGE_UPS1) }, { } }; static bool hid_hiddev(struct hid_device *hdev) { return !!hid_match_id(hdev, hid_hiddev_list); } static ssize_t report_descriptor_read(struct file *filp, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct hid_device *hdev = to_hid_device(dev); if (off >= hdev->rsize) return 0; if (off + count > hdev->rsize) count = hdev->rsize - off; memcpy(buf, hdev->rdesc + off, count); return count; } static ssize_t country_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); return sprintf(buf, "%02x\n", hdev->country & 0xff); } static const BIN_ATTR_RO(report_descriptor, HID_MAX_DESCRIPTOR_SIZE); static const DEVICE_ATTR_RO(country); int hid_connect(struct hid_device *hdev, unsigned int connect_mask) { static const char *types[] = { "Device", "Pointer", "Mouse", "Device", "Joystick", "Gamepad", "Keyboard", "Keypad", "Multi-Axis Controller" }; const char *type, *bus; char buf[64] = ""; unsigned int i; int len; int ret; ret = hid_bpf_connect_device(hdev); if (ret) return ret; if (hdev->quirks & HID_QUIRK_HIDDEV_FORCE) connect_mask |= (HID_CONNECT_HIDDEV_FORCE | HID_CONNECT_HIDDEV); if (hdev->quirks & HID_QUIRK_HIDINPUT_FORCE) connect_mask |= HID_CONNECT_HIDINPUT_FORCE; if (hdev->bus != BUS_USB) connect_mask &= ~HID_CONNECT_HIDDEV; if (hid_hiddev(hdev)) connect_mask |= HID_CONNECT_HIDDEV_FORCE; if ((connect_mask & HID_CONNECT_HIDINPUT) && !hidinput_connect(hdev, connect_mask & HID_CONNECT_HIDINPUT_FORCE)) hdev->claimed |= HID_CLAIMED_INPUT; if ((connect_mask & HID_CONNECT_HIDDEV) && hdev->hiddev_connect && !hdev->hiddev_connect(hdev, connect_mask & HID_CONNECT_HIDDEV_FORCE)) hdev->claimed |= HID_CLAIMED_HIDDEV; if ((connect_mask & HID_CONNECT_HIDRAW) && !hidraw_connect(hdev)) hdev->claimed |= HID_CLAIMED_HIDRAW; if (connect_mask & HID_CONNECT_DRIVER) hdev->claimed |= HID_CLAIMED_DRIVER; /* Drivers with the ->raw_event callback set are not required to connect * to any other listener. */ if (!hdev->claimed && !hdev->driver->raw_event) { hid_err(hdev, "device has no listeners, quitting\n"); return -ENODEV; } hid_process_ordering(hdev); if ((hdev->claimed & HID_CLAIMED_INPUT) && (connect_mask & HID_CONNECT_FF) && hdev->ff_init) hdev->ff_init(hdev); len = 0; if (hdev->claimed & HID_CLAIMED_INPUT) len += sprintf(buf + len, "input"); if (hdev->claimed & HID_CLAIMED_HIDDEV) len += sprintf(buf + len, "%shiddev%d", len ? "," : "", ((struct hiddev *)hdev->hiddev)->minor); if (hdev->claimed & HID_CLAIMED_HIDRAW) len += sprintf(buf + len, "%shidraw%d", len ? "," : "", ((struct hidraw *)hdev->hidraw)->minor); type = "Device"; for (i = 0; i < hdev->maxcollection; i++) { struct hid_collection *col = &hdev->collection[i]; if (col->type == HID_COLLECTION_APPLICATION && (col->usage & HID_USAGE_PAGE) == HID_UP_GENDESK && (col->usage & 0xffff) < ARRAY_SIZE(types)) { type = types[col->usage & 0xffff]; break; } } switch (hdev->bus) { case BUS_USB: bus = "USB"; break; case BUS_BLUETOOTH: bus = "BLUETOOTH"; break; case BUS_I2C: bus = "I2C"; break; case BUS_VIRTUAL: bus = "VIRTUAL"; break; case BUS_INTEL_ISHTP: case BUS_AMD_SFH: bus = "SENSOR HUB"; break; default: bus = "<UNKNOWN>"; } ret = device_create_file(&hdev->dev, &dev_attr_country); if (ret) hid_warn(hdev, "can't create sysfs country code attribute err: %d\n", ret); hid_info(hdev, "%s: %s HID v%x.%02x %s [%s] on %s\n", buf, bus, hdev->version >> 8, hdev->version & 0xff, type, hdev->name, hdev->phys); return 0; } EXPORT_SYMBOL_GPL(hid_connect); void hid_disconnect(struct hid_device *hdev) { device_remove_file(&hdev->dev, &dev_attr_country); if (hdev->claimed & HID_CLAIMED_INPUT) hidinput_disconnect(hdev); if (hdev->claimed & HID_CLAIMED_HIDDEV) hdev->hiddev_disconnect(hdev); if (hdev->claimed & HID_CLAIMED_HIDRAW) hidraw_disconnect(hdev); hdev->claimed = 0; hid_bpf_disconnect_device(hdev); } EXPORT_SYMBOL_GPL(hid_disconnect); /** * hid_hw_start - start underlying HW * @hdev: hid device * @connect_mask: which outputs to connect, see HID_CONNECT_* * * Call this in probe function *after* hid_parse. This will setup HW * buffers and start the device (if not defeirred to device open). * hid_hw_stop must be called if this was successful. */ int hid_hw_start(struct hid_device *hdev, unsigned int connect_mask) { int error; error = hdev->ll_driver->start(hdev); if (error) return error; if (connect_mask) { error = hid_connect(hdev, connect_mask); if (error) { hdev->ll_driver->stop(hdev); return error; } } return 0; } EXPORT_SYMBOL_GPL(hid_hw_start); /** * hid_hw_stop - stop underlying HW * @hdev: hid device * * This is usually called from remove function or from probe when something * failed and hid_hw_start was called already. */ void hid_hw_stop(struct hid_device *hdev) { hid_disconnect(hdev); hdev->ll_driver->stop(hdev); } EXPORT_SYMBOL_GPL(hid_hw_stop); /** * hid_hw_open - signal underlying HW to start delivering events * @hdev: hid device * * Tell underlying HW to start delivering events from the device. * This function should be called sometime after successful call * to hid_hw_start(). */ int hid_hw_open(struct hid_device *hdev) { int ret; ret = mutex_lock_killable(&hdev->ll_open_lock); if (ret) return ret; if (!hdev->ll_open_count++) { ret = hdev->ll_driver->open(hdev); if (ret) hdev->ll_open_count--; } mutex_unlock(&hdev->ll_open_lock); return ret; } EXPORT_SYMBOL_GPL(hid_hw_open); /** * hid_hw_close - signal underlaying HW to stop delivering events * * @hdev: hid device * * This function indicates that we are not interested in the events * from this device anymore. Delivery of events may or may not stop, * depending on the number of users still outstanding. */ void hid_hw_close(struct hid_device *hdev) { mutex_lock(&hdev->ll_open_lock); if (!--hdev->ll_open_count) hdev->ll_driver->close(hdev); mutex_unlock(&hdev->ll_open_lock); } EXPORT_SYMBOL_GPL(hid_hw_close); /** * hid_hw_request - send report request to device * * @hdev: hid device * @report: report to send * @reqtype: hid request type */ void hid_hw_request(struct hid_device *hdev, struct hid_report *report, enum hid_class_request reqtype) { if (hdev->ll_driver->request) return hdev->ll_driver->request(hdev, report, reqtype); __hid_request(hdev, report, reqtype); } EXPORT_SYMBOL_GPL(hid_hw_request); int __hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype, u64 source, bool from_bpf) { unsigned int max_buffer_size = HID_MAX_BUFFER_SIZE; int ret; if (hdev->ll_driver->max_buffer_size) max_buffer_size = hdev->ll_driver->max_buffer_size; if (len < 1 || len > max_buffer_size || !buf) return -EINVAL; ret = dispatch_hid_bpf_raw_requests(hdev, reportnum, buf, len, rtype, reqtype, source, from_bpf); if (ret) return ret; return hdev->ll_driver->raw_request(hdev, reportnum, buf, len, rtype, reqtype); } /** * hid_hw_raw_request - send report request to device * * @hdev: hid device * @reportnum: report ID * @buf: in/out data to transfer * @len: length of buf * @rtype: HID report type * @reqtype: HID_REQ_GET_REPORT or HID_REQ_SET_REPORT * * Return: count of data transferred, negative if error * * Same behavior as hid_hw_request, but with raw buffers instead. */ int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype) { return __hid_hw_raw_request(hdev, reportnum, buf, len, rtype, reqtype, 0, false); } EXPORT_SYMBOL_GPL(hid_hw_raw_request); int __hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len, u64 source, bool from_bpf) { unsigned int max_buffer_size = HID_MAX_BUFFER_SIZE; int ret; if (hdev->ll_driver->max_buffer_size) max_buffer_size = hdev->ll_driver->max_buffer_size; if (len < 1 || len > max_buffer_size || !buf) return -EINVAL; ret = dispatch_hid_bpf_output_report(hdev, buf, len, source, from_bpf); if (ret) return ret; if (hdev->ll_driver->output_report) return hdev->ll_driver->output_report(hdev, buf, len); return -ENOSYS; } /** * hid_hw_output_report - send output report to device * * @hdev: hid device * @buf: raw data to transfer * @len: length of buf * * Return: count of data transferred, negative if error */ int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len) { return __hid_hw_output_report(hdev, buf, len, 0, false); } EXPORT_SYMBOL_GPL(hid_hw_output_report); #ifdef CONFIG_PM int hid_driver_suspend(struct hid_device *hdev, pm_message_t state) { if (hdev->driver && hdev->driver->suspend) return hdev->driver->suspend(hdev, state); return 0; } EXPORT_SYMBOL_GPL(hid_driver_suspend); int hid_driver_reset_resume(struct hid_device *hdev) { if (hdev->driver && hdev->driver->reset_resume) return hdev->driver->reset_resume(hdev); return 0; } EXPORT_SYMBOL_GPL(hid_driver_reset_resume); int hid_driver_resume(struct hid_device *hdev) { if (hdev->driver && hdev->driver->resume) return hdev->driver->resume(hdev); return 0; } EXPORT_SYMBOL_GPL(hid_driver_resume); #endif /* CONFIG_PM */ struct hid_dynid { struct list_head list; struct hid_device_id id; }; /** * new_id_store - add a new HID device ID to this driver and re-probe devices * @drv: target device driver * @buf: buffer for scanning device ID data * @count: input size * * Adds a new dynamic hid device ID to this driver, * and causes the driver to probe for all devices again. */ static ssize_t new_id_store(struct device_driver *drv, const char *buf, size_t count) { struct hid_driver *hdrv = to_hid_driver(drv); struct hid_dynid *dynid; __u32 bus, vendor, product; unsigned long driver_data = 0; int ret; ret = sscanf(buf, "%x %x %x %lx", &bus, &vendor, &product, &driver_data); if (ret < 3) return -EINVAL; dynid = kzalloc(sizeof(*dynid), GFP_KERNEL); if (!dynid) return -ENOMEM; dynid->id.bus = bus; dynid->id.group = HID_GROUP_ANY; dynid->id.vendor = vendor; dynid->id.product = product; dynid->id.driver_data = driver_data; spin_lock(&hdrv->dyn_lock); list_add_tail(&dynid->list, &hdrv->dyn_list); spin_unlock(&hdrv->dyn_lock); ret = driver_attach(&hdrv->driver); return ret ? : count; } static DRIVER_ATTR_WO(new_id); static struct attribute *hid_drv_attrs[] = { &driver_attr_new_id.attr, NULL, }; ATTRIBUTE_GROUPS(hid_drv); static void hid_free_dynids(struct hid_driver *hdrv) { struct hid_dynid *dynid, *n; spin_lock(&hdrv->dyn_lock); list_for_each_entry_safe(dynid, n, &hdrv->dyn_list, list) { list_del(&dynid->list); kfree(dynid); } spin_unlock(&hdrv->dyn_lock); } const struct hid_device_id *hid_match_device(struct hid_device *hdev, struct hid_driver *hdrv) { struct hid_dynid *dynid; spin_lock(&hdrv->dyn_lock); list_for_each_entry(dynid, &hdrv->dyn_list, list) { if (hid_match_one_id(hdev, &dynid->id)) { spin_unlock(&hdrv->dyn_lock); return &dynid->id; } } spin_unlock(&hdrv->dyn_lock); return hid_match_id(hdev, hdrv->id_table); } EXPORT_SYMBOL_GPL(hid_match_device); static int hid_bus_match(struct device *dev, const struct device_driver *drv) { struct hid_driver *hdrv = to_hid_driver(drv); struct hid_device *hdev = to_hid_device(dev); return hid_match_device(hdev, hdrv) != NULL; } /** * hid_compare_device_paths - check if both devices share the same path * @hdev_a: hid device * @hdev_b: hid device * @separator: char to use as separator * * Check if two devices share the same path up to the last occurrence of * the separator char. Both paths must exist (i.e., zero-length paths * don't match). */ bool hid_compare_device_paths(struct hid_device *hdev_a, struct hid_device *hdev_b, char separator) { int n1 = strrchr(hdev_a->phys, separator) - hdev_a->phys; int n2 = strrchr(hdev_b->phys, separator) - hdev_b->phys; if (n1 != n2 || n1 <= 0 || n2 <= 0) return false; return !strncmp(hdev_a->phys, hdev_b->phys, n1); } EXPORT_SYMBOL_GPL(hid_compare_device_paths); static bool hid_check_device_match(struct hid_device *hdev, struct hid_driver *hdrv, const struct hid_device_id **id) { *id = hid_match_device(hdev, hdrv); if (!*id) return false; if (hdrv->match) return hdrv->match(hdev, hid_ignore_special_drivers); /* * hid-generic implements .match(), so we must be dealing with a * different HID driver here, and can simply check if * hid_ignore_special_drivers or HID_QUIRK_IGNORE_SPECIAL_DRIVER * are set or not. */ return !hid_ignore_special_drivers && !(hdev->quirks & HID_QUIRK_IGNORE_SPECIAL_DRIVER); } static int __hid_device_probe(struct hid_device *hdev, struct hid_driver *hdrv) { const struct hid_device_id *id; int ret; if (!hdev->bpf_rsize) { /* in case a bpf program gets detached, we need to free the old one */ hid_free_bpf_rdesc(hdev); /* keep this around so we know we called it once */ hdev->bpf_rsize = hdev->dev_rsize; /* call_hid_bpf_rdesc_fixup will always return a valid pointer */ hdev->bpf_rdesc = call_hid_bpf_rdesc_fixup(hdev, hdev->dev_rdesc, &hdev->bpf_rsize); } if (!hid_check_device_match(hdev, hdrv, &id)) return -ENODEV; hdev->devres_group_id = devres_open_group(&hdev->dev, NULL, GFP_KERNEL); if (!hdev->devres_group_id) return -ENOMEM; /* reset the quirks that has been previously set */ hdev->quirks = hid_lookup_quirk(hdev); hdev->driver = hdrv; if (hdrv->probe) { ret = hdrv->probe(hdev, id); } else { /* default probe */ ret = hid_open_report(hdev); if (!ret) ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); } /* * Note that we are not closing the devres group opened above so * even resources that were attached to the device after probe is * run are released when hid_device_remove() is executed. This is * needed as some drivers would allocate additional resources, * for example when updating firmware. */ if (ret) { devres_release_group(&hdev->dev, hdev->devres_group_id); hid_close_report(hdev); hdev->driver = NULL; } return ret; } static int hid_device_probe(struct device *dev) { struct hid_device *hdev = to_hid_device(dev); struct hid_driver *hdrv = to_hid_driver(dev->driver); int ret = 0; if (down_interruptible(&hdev->driver_input_lock)) return -EINTR; hdev->io_started = false; clear_bit(ffs(HID_STAT_REPROBED), &hdev->status); if (!hdev->driver) ret = __hid_device_probe(hdev, hdrv); if (!hdev->io_started) up(&hdev->driver_input_lock); return ret; } static void hid_device_remove(struct device *dev) { struct hid_device *hdev = to_hid_device(dev); struct hid_driver *hdrv; down(&hdev->driver_input_lock); hdev->io_started = false; hdrv = hdev->driver; if (hdrv) { if (hdrv->remove) hdrv->remove(hdev); else /* default remove */ hid_hw_stop(hdev); /* Release all devres resources allocated by the driver */ devres_release_group(&hdev->dev, hdev->devres_group_id); hid_close_report(hdev); hdev->driver = NULL; } if (!hdev->io_started) up(&hdev->driver_input_lock); } static ssize_t modalias_show(struct device *dev, struct device_attribute *a, char *buf) { struct hid_device *hdev = container_of(dev, struct hid_device, dev); return scnprintf(buf, PAGE_SIZE, "hid:b%04Xg%04Xv%08Xp%08X\n", hdev->bus, hdev->group, hdev->vendor, hdev->product); } static DEVICE_ATTR_RO(modalias); static struct attribute *hid_dev_attrs[] = { &dev_attr_modalias.attr, NULL, }; static const struct bin_attribute *hid_dev_bin_attrs[] = { &bin_attr_report_descriptor, NULL }; static const struct attribute_group hid_dev_group = { .attrs = hid_dev_attrs, .bin_attrs_new = hid_dev_bin_attrs, }; __ATTRIBUTE_GROUPS(hid_dev); static int hid_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct hid_device *hdev = to_hid_device(dev); if (add_uevent_var(env, "HID_ID=%04X:%08X:%08X", hdev->bus, hdev->vendor, hdev->product)) return -ENOMEM; if (add_uevent_var(env, "HID_NAME=%s", hdev->name)) return -ENOMEM; if (add_uevent_var(env, "HID_PHYS=%s", hdev->phys)) return -ENOMEM; if (add_uevent_var(env, "HID_UNIQ=%s", hdev->uniq)) return -ENOMEM; if (add_uevent_var(env, "MODALIAS=hid:b%04Xg%04Xv%08Xp%08X", hdev->bus, hdev->group, hdev->vendor, hdev->product)) return -ENOMEM; return 0; } const struct bus_type hid_bus_type = { .name = "hid", .dev_groups = hid_dev_groups, .drv_groups = hid_drv_groups, .match = hid_bus_match, .probe = hid_device_probe, .remove = hid_device_remove, .uevent = hid_uevent, }; EXPORT_SYMBOL(hid_bus_type); int hid_add_device(struct hid_device *hdev) { static atomic_t id = ATOMIC_INIT(0); int ret; if (WARN_ON(hdev->status & HID_STAT_ADDED)) return -EBUSY; hdev->quirks = hid_lookup_quirk(hdev); /* we need to kill them here, otherwise they will stay allocated to * wait for coming driver */ if (hid_ignore(hdev)) return -ENODEV; /* * Check for the mandatory transport channel. */ if (!hdev->ll_driver->raw_request) { hid_err(hdev, "transport driver missing .raw_request()\n"); return -EINVAL; } /* * Read the device report descriptor once and use as template * for the driver-specific modifications. */ ret = hdev->ll_driver->parse(hdev); if (ret) return ret; if (!hdev->dev_rdesc) return -ENODEV; /* * Scan generic devices for group information */ if (hid_ignore_special_drivers) { hdev->group = HID_GROUP_GENERIC; } else if (!hdev->group && !(hdev->quirks & HID_QUIRK_HAVE_SPECIAL_DRIVER)) { ret = hid_scan_report(hdev); if (ret) hid_warn(hdev, "bad device descriptor (%d)\n", ret); } hdev->id = atomic_inc_return(&id); /* XXX hack, any other cleaner solution after the driver core * is converted to allow more than 20 bytes as the device name? */ dev_set_name(&hdev->dev, "%04X:%04X:%04X.%04X", hdev->bus, hdev->vendor, hdev->product, hdev->id); hid_debug_register(hdev, dev_name(&hdev->dev)); ret = device_add(&hdev->dev); if (!ret) hdev->status |= HID_STAT_ADDED; else hid_debug_unregister(hdev); return ret; } EXPORT_SYMBOL_GPL(hid_add_device); /** * hid_allocate_device - allocate new hid device descriptor * * Allocate and initialize hid device, so that hid_destroy_device might be * used to free it. * * New hid_device pointer is returned on success, otherwise ERR_PTR encoded * error value. */ struct hid_device *hid_allocate_device(void) { struct hid_device *hdev; int ret = -ENOMEM; hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); if (hdev == NULL) return ERR_PTR(ret); device_initialize(&hdev->dev); hdev->dev.release = hid_device_release; hdev->dev.bus = &hid_bus_type; device_enable_async_suspend(&hdev->dev); hid_close_report(hdev); init_waitqueue_head(&hdev->debug_wait); INIT_LIST_HEAD(&hdev->debug_list); spin_lock_init(&hdev->debug_list_lock); sema_init(&hdev->driver_input_lock, 1); mutex_init(&hdev->ll_open_lock); kref_init(&hdev->ref); ret = hid_bpf_device_init(hdev); if (ret) goto out_err; return hdev; out_err: hid_destroy_device(hdev); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(hid_allocate_device); static void hid_remove_device(struct hid_device *hdev) { if (hdev->status & HID_STAT_ADDED) { device_del(&hdev->dev); hid_debug_unregister(hdev); hdev->status &= ~HID_STAT_ADDED; } hid_free_bpf_rdesc(hdev); kfree(hdev->dev_rdesc); hdev->dev_rdesc = NULL; hdev->dev_rsize = 0; hdev->bpf_rsize = 0; } /** * hid_destroy_device - free previously allocated device * * @hdev: hid device * * If you allocate hid_device through hid_allocate_device, you should ever * free by this function. */ void hid_destroy_device(struct hid_device *hdev) { hid_bpf_destroy_device(hdev); hid_remove_device(hdev); put_device(&hdev->dev); } EXPORT_SYMBOL_GPL(hid_destroy_device); static int __hid_bus_reprobe_drivers(struct device *dev, void *data) { struct hid_driver *hdrv = data; struct hid_device *hdev = to_hid_device(dev); if (hdev->driver == hdrv && !hdrv->match(hdev, hid_ignore_special_drivers) && !test_and_set_bit(ffs(HID_STAT_REPROBED), &hdev->status)) return device_reprobe(dev); return 0; } static int __hid_bus_driver_added(struct device_driver *drv, void *data) { struct hid_driver *hdrv = to_hid_driver(drv); if (hdrv->match) { bus_for_each_dev(&hid_bus_type, NULL, hdrv, __hid_bus_reprobe_drivers); } return 0; } static int __bus_removed_driver(struct device_driver *drv, void *data) { return bus_rescan_devices(&hid_bus_type); } int __hid_register_driver(struct hid_driver *hdrv, struct module *owner, const char *mod_name) { int ret; hdrv->driver.name = hdrv->name; hdrv->driver.bus = &hid_bus_type; hdrv->driver.owner = owner; hdrv->driver.mod_name = mod_name; INIT_LIST_HEAD(&hdrv->dyn_list); spin_lock_init(&hdrv->dyn_lock); ret = driver_register(&hdrv->driver); if (ret == 0) bus_for_each_drv(&hid_bus_type, NULL, NULL, __hid_bus_driver_added); return ret; } EXPORT_SYMBOL_GPL(__hid_register_driver); void hid_unregister_driver(struct hid_driver *hdrv) { driver_unregister(&hdrv->driver); hid_free_dynids(hdrv); bus_for_each_drv(&hid_bus_type, NULL, hdrv, __bus_removed_driver); } EXPORT_SYMBOL_GPL(hid_unregister_driver); int hid_check_keys_pressed(struct hid_device *hid) { struct hid_input *hidinput; int i; if (!(hid->claimed & HID_CLAIMED_INPUT)) return 0; list_for_each_entry(hidinput, &hid->inputs, list) { for (i = 0; i < BITS_TO_LONGS(KEY_MAX); i++) if (hidinput->input->key[i]) return 1; } return 0; } EXPORT_SYMBOL_GPL(hid_check_keys_pressed); #ifdef CONFIG_HID_BPF static const struct hid_ops __hid_ops = { .hid_get_report = hid_get_report, .hid_hw_raw_request = __hid_hw_raw_request, .hid_hw_output_report = __hid_hw_output_report, .hid_input_report = __hid_input_report, .owner = THIS_MODULE, .bus_type = &hid_bus_type, }; #endif static int __init hid_init(void) { int ret; ret = bus_register(&hid_bus_type); if (ret) { pr_err("can't register hid bus\n"); goto err; } #ifdef CONFIG_HID_BPF hid_ops = &__hid_ops; #endif ret = hidraw_init(); if (ret) goto err_bus; hid_debug_init(); return 0; err_bus: bus_unregister(&hid_bus_type); err: return ret; } static void __exit hid_exit(void) { #ifdef CONFIG_HID_BPF hid_ops = NULL; #endif hid_debug_exit(); hidraw_exit(); bus_unregister(&hid_bus_type); hid_quirks_exit(HID_BUS_ANY); } module_init(hid_init); module_exit(hid_exit); MODULE_AUTHOR("Andreas Gal"); MODULE_AUTHOR("Vojtech Pavlik"); MODULE_AUTHOR("Jiri Kosina"); MODULE_DESCRIPTION("HID support for Linux"); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 /* SPDX-License-Identifier: GPL-2.0-only */ /* * pm_runtime.h - Device run-time power management helper functions. * * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl> */ #ifndef _LINUX_PM_RUNTIME_H #define _LINUX_PM_RUNTIME_H #include <linux/device.h> #include <linux/notifier.h> #include <linux/pm.h> #include <linux/jiffies.h> /* Runtime PM flag argument bits */ #define RPM_ASYNC 0x01 /* Request is asynchronous */ #define RPM_NOWAIT 0x02 /* Don't wait for concurrent state change */ #define RPM_GET_PUT 0x04 /* Increment/decrement the usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ /* * Use this for defining a set of PM operations to be used in all situations * (system suspend, hibernation or runtime PM). * * Note that the behaviour differs from the deprecated UNIVERSAL_DEV_PM_OPS() * macro, which uses the provided callbacks for both runtime PM and system * sleep, while DEFINE_RUNTIME_DEV_PM_OPS() uses pm_runtime_force_suspend() * and pm_runtime_force_resume() for its system sleep callbacks. * * If the underlying dev_pm_ops struct symbol has to be exported, use * EXPORT_RUNTIME_DEV_PM_OPS() or EXPORT_GPL_RUNTIME_DEV_PM_OPS() instead. */ #define DEFINE_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ _DEFINE_DEV_PM_OPS(name, pm_runtime_force_suspend, \ pm_runtime_force_resume, suspend_fn, \ resume_fn, idle_fn) #define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ EXPORT_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ EXPORT_GPL_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ EXPORT_NS_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; static inline bool queue_pm_work(struct work_struct *work) { return queue_work(pm_wq, work); } extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); extern int pm_runtime_force_suspend(struct device *dev); extern int pm_runtime_force_resume(struct device *dev); extern int __pm_runtime_idle(struct device *dev, int rpmflags); extern int __pm_runtime_suspend(struct device *dev, int rpmflags); extern int __pm_runtime_resume(struct device *dev, int rpmflags); extern int pm_runtime_get_if_active(struct device *dev); extern int pm_runtime_get_if_in_use(struct device *dev); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); extern void pm_runtime_allow(struct device *dev); extern void pm_runtime_forbid(struct device *dev); extern void pm_runtime_no_callbacks(struct device *dev); extern void pm_runtime_irq_safe(struct device *dev); extern void __pm_runtime_use_autosuspend(struct device *dev, bool use); extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); extern u64 pm_runtime_autosuspend_expiration(struct device *dev); extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); extern void pm_runtime_release_supplier(struct device_link *link); extern int devm_pm_runtime_enable(struct device *dev); /** * pm_suspend_ignore_children - Set runtime PM behavior regarding children. * @dev: Target device. * @enable: Whether or not to ignore possible dependencies on children. * * The dependencies of @dev on its children will not be taken into account by * the runtime PM framework going forward if @enable is %true, or they will * be taken into account otherwise. */ static inline void pm_suspend_ignore_children(struct device *dev, bool enable) { dev->power.ignore_children = enable; } /** * pm_runtime_get_noresume - Bump up runtime PM usage counter of a device. * @dev: Target device. */ static inline void pm_runtime_get_noresume(struct device *dev) { atomic_inc(&dev->power.usage_count); } /** * pm_runtime_put_noidle - Drop runtime PM usage counter of a device. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev unless it is 0 already. */ static inline void pm_runtime_put_noidle(struct device *dev) { atomic_add_unless(&dev->power.usage_count, -1, 0); } /** * pm_runtime_suspended - Check whether or not a device is runtime-suspended. * @dev: Target device. * * Return %true if runtime PM is enabled for @dev and its runtime PM status is * %RPM_SUSPENDED, or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev and its runtime PM * status cannot change. */ static inline bool pm_runtime_suspended(struct device *dev) { return dev->power.runtime_status == RPM_SUSPENDED && !dev->power.disable_depth; } /** * pm_runtime_active - Check whether or not a device is runtime-active. * @dev: Target device. * * Return %true if runtime PM is disabled for @dev or its runtime PM status is * %RPM_ACTIVE, or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev and its runtime PM * status cannot change. */ static inline bool pm_runtime_active(struct device *dev) { return dev->power.runtime_status == RPM_ACTIVE || dev->power.disable_depth; } /** * pm_runtime_status_suspended - Check if runtime PM status is "suspended". * @dev: Target device. * * Return %true if the runtime PM status of @dev is %RPM_SUSPENDED, or %false * otherwise, regardless of whether or not runtime PM has been enabled for @dev. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which the * runtime PM status of @dev cannot change. */ static inline bool pm_runtime_status_suspended(struct device *dev) { return dev->power.runtime_status == RPM_SUSPENDED; } /** * pm_runtime_enabled - Check if runtime PM is enabled. * @dev: Target device. * * Return %true if runtime PM is enabled for @dev or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev. */ static inline bool pm_runtime_enabled(struct device *dev) { return !dev->power.disable_depth; } /** * pm_runtime_has_no_callbacks - Check if runtime PM callbacks may be present. * @dev: Target device. * * Return %true if @dev is a special device without runtime PM callbacks or * %false otherwise. */ static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return dev->power.no_callbacks; } /** * pm_runtime_mark_last_busy - Update the last access time of a device. * @dev: Target device. * * Update the last access time of @dev used by the runtime PM autosuspend * mechanism to the current time as returned by ktime_get_mono_fast_ns(). */ static inline void pm_runtime_mark_last_busy(struct device *dev) { WRITE_ONCE(dev->power.last_busy, ktime_get_mono_fast_ns()); } /** * pm_runtime_is_irq_safe - Check if runtime PM can work in interrupt context. * @dev: Target device. * * Return %true if @dev has been marked as an "IRQ-safe" device (with respect * to runtime PM), in which case its runtime PM callabcks can be expected to * work correctly when invoked from interrupt handlers. */ static inline bool pm_runtime_is_irq_safe(struct device *dev) { return dev->power.irq_safe; } extern u64 pm_runtime_suspended_time(struct device *dev); #else /* !CONFIG_PM */ static inline bool queue_pm_work(struct work_struct *work) { return false; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } static inline int pm_runtime_force_suspend(struct device *dev) { return 0; } static inline int pm_runtime_force_resume(struct device *dev) { return 0; } static inline int __pm_runtime_idle(struct device *dev, int rpmflags) { return -ENOSYS; } static inline int __pm_runtime_suspend(struct device *dev, int rpmflags) { return -ENOSYS; } static inline int __pm_runtime_resume(struct device *dev, int rpmflags) { return 1; } static inline int pm_schedule_suspend(struct device *dev, unsigned int delay) { return -ENOSYS; } static inline int pm_runtime_get_if_in_use(struct device *dev) { return -EINVAL; } static inline int pm_runtime_get_if_active(struct device *dev) { return -EINVAL; } static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } static inline int pm_runtime_barrier(struct device *dev) { return 0; } static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} static inline int devm_pm_runtime_enable(struct device *dev) { return 0; } static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {} static inline void pm_runtime_get_noresume(struct device *dev) {} static inline void pm_runtime_put_noidle(struct device *dev) {} static inline bool pm_runtime_suspended(struct device *dev) { return false; } static inline bool pm_runtime_active(struct device *dev) { return true; } static inline bool pm_runtime_status_suspended(struct device *dev) { return false; } static inline bool pm_runtime_enabled(struct device *dev) { return false; } static inline void pm_runtime_no_callbacks(struct device *dev) {} static inline void pm_runtime_irq_safe(struct device *dev) {} static inline bool pm_runtime_is_irq_safe(struct device *dev) { return false; } static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return false; } static inline void pm_runtime_mark_last_busy(struct device *dev) {} static inline void __pm_runtime_use_autosuspend(struct device *dev, bool use) {} static inline void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) {} static inline u64 pm_runtime_autosuspend_expiration( struct device *dev) { return 0; } static inline void pm_runtime_set_memalloc_noio(struct device *dev, bool enable){} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} static inline void pm_runtime_drop_link(struct device_link *link) {} static inline void pm_runtime_release_supplier(struct device_link *link) {} #endif /* !CONFIG_PM */ /** * pm_runtime_idle - Conditionally set up autosuspend of a device or suspend it. * @dev: Target device. * * Invoke the "idle check" callback of @dev and, depending on its return value, * set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend has been enabled for it). */ static inline int pm_runtime_idle(struct device *dev) { return __pm_runtime_idle(dev, 0); } /** * pm_runtime_suspend - Suspend a device synchronously. * @dev: Target device. */ static inline int pm_runtime_suspend(struct device *dev) { return __pm_runtime_suspend(dev, 0); } /** * pm_runtime_autosuspend - Set up autosuspend of a device or suspend it. * @dev: Target device. * * Set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend is enabled for it) without engaging its "idle check" callback. */ static inline int pm_runtime_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_AUTO); } /** * pm_runtime_resume - Resume a device synchronously. * @dev: Target device. */ static inline int pm_runtime_resume(struct device *dev) { return __pm_runtime_resume(dev, 0); } /** * pm_request_idle - Queue up "idle check" execution for a device. * @dev: Target device. * * Queue up a work item to run an equivalent of pm_runtime_idle() for @dev * asynchronously. */ static inline int pm_request_idle(struct device *dev) { return __pm_runtime_idle(dev, RPM_ASYNC); } /** * pm_request_resume - Queue up runtime-resume of a device. * @dev: Target device. */ static inline int pm_request_resume(struct device *dev) { return __pm_runtime_resume(dev, RPM_ASYNC); } /** * pm_request_autosuspend - Queue up autosuspend of a device. * @dev: Target device. * * Queue up a work item to run an equivalent pm_runtime_autosuspend() for @dev * asynchronously. */ static inline int pm_request_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_get - Bump up usage counter and queue up resume of a device. * @dev: Target device. * * Bump up the runtime PM usage counter of @dev and queue up a work item to * carry out runtime-resume of it. */ static inline int pm_runtime_get(struct device *dev) { return __pm_runtime_resume(dev, RPM_GET_PUT | RPM_ASYNC); } /** * pm_runtime_get_sync - Bump up usage counter of a device and resume it. * @dev: Target device. * * Bump up the runtime PM usage counter of @dev and carry out runtime-resume of * it synchronously. * * The possible return values of this function are the same as for * pm_runtime_resume() and the runtime PM usage counter of @dev remains * incremented in all cases, even if it returns an error code. * Consider using pm_runtime_resume_and_get() instead of it, especially * if its return value is checked by the caller, as this is likely to result * in cleaner code. */ static inline int pm_runtime_get_sync(struct device *dev) { return __pm_runtime_resume(dev, RPM_GET_PUT); } /** * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. * @dev: Target device. * * Resume @dev synchronously and if that is successful, increment its runtime * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been * incremented or a negative error code otherwise. */ static inline int pm_runtime_resume_and_get(struct device *dev) { int ret; ret = __pm_runtime_resume(dev, RPM_GET_PUT); if (ret < 0) { pm_runtime_put_noidle(dev); return ret; } return 0; } /** * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_idle(). */ static inline int pm_runtime_put(struct device *dev) { return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } /** * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). */ static inline int __pm_runtime_put_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). */ static inline int pm_runtime_put_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, invoke the "idle check" callback of @dev and, depending on its * return value, set up autosuspend of @dev or suspend it (depending on whether * or not autosuspend has been enabled for it). * * The possible return values of this function are the same as for * pm_runtime_idle() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync(struct device *dev) { return __pm_runtime_idle(dev, RPM_GET_PUT); } /** * pm_runtime_put_sync_suspend - Drop device usage counter and suspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, carry out runtime-suspend of @dev synchronously. * * The possible return values of this function are the same as for * pm_runtime_suspend() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync_suspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT); } /** * pm_runtime_put_sync_autosuspend - Drop device usage counter and autosuspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, set up autosuspend of @dev or suspend it synchronously (depending * on whether or not autosuspend has been enabled for it). * * The possible return values of this function are the same as for * pm_runtime_autosuspend() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_AUTO); } /** * pm_runtime_set_active - Set runtime PM status to "active". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_ACTIVE and ensure that dependencies * of it will be taken into account. * * It is not valid to call this function for devices with runtime PM enabled. */ static inline int pm_runtime_set_active(struct device *dev) { return __pm_runtime_set_status(dev, RPM_ACTIVE); } /** * pm_runtime_set_suspended - Set runtime PM status to "suspended". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_SUSPENDED and ensure that * dependencies of it will be taken into account. * * It is not valid to call this function for devices with runtime PM enabled. */ static inline int pm_runtime_set_suspended(struct device *dev) { return __pm_runtime_set_status(dev, RPM_SUSPENDED); } /** * pm_runtime_disable - Disable runtime PM for a device. * @dev: Target device. * * Prevent the runtime PM framework from working with @dev (by incrementing its * "blocking" counter). * * For each invocation of this function for @dev there must be a matching * pm_runtime_enable() call in order for runtime PM to be enabled for it. */ static inline void pm_runtime_disable(struct device *dev) { __pm_runtime_disable(dev, true); } /** * pm_runtime_use_autosuspend - Allow autosuspend to be used for a device. * @dev: Target device. * * Allow the runtime PM autosuspend mechanism to be used for @dev whenever * requested (or "autosuspend" will be handled as direct runtime-suspend for * it). * * NOTE: It's important to undo this with pm_runtime_dont_use_autosuspend() * at driver exit time unless your driver initially enabled pm_runtime * with devm_pm_runtime_enable() (which handles it for you). */ static inline void pm_runtime_use_autosuspend(struct device *dev) { __pm_runtime_use_autosuspend(dev, true); } /** * pm_runtime_dont_use_autosuspend - Prevent autosuspend from being used. * @dev: Target device. * * Prevent the runtime PM autosuspend mechanism from being used for @dev which * means that "autosuspend" will be handled as direct runtime-suspend for it * going forward. */ static inline void pm_runtime_dont_use_autosuspend(struct device *dev) { __pm_runtime_use_autosuspend(dev, false); } #endif
157 156 143 144 93 5 113 141 139 16 93 120 121 110 5 121 156 154 99 98 98 30 30 30 4 4 157 15 4 143 156 9 2 156 157 2 157 11 10 155 158 158 157 12 157 156 7 157 1 1 1 1 1 1 1 119 120 6 10 5 120 120 121 11 120 119 1 1 1 117 12 69 88 157 157 156 154 121 121 121 121 1 1 2 21 111 117 9 120 9 157 107 107 6 11 10 4 5 157 144 57 4 7 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * * This file is part of the SCTP kernel implementation * * These functions handle output processing. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@austin.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> #include <linux/time.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/init.h> #include <linux/slab.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/icmp.h> #include <net/net_namespace.h> #include <linux/socket.h> /* for sa_family_t */ #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> /* Forward declarations for private helpers. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len); static void sctp_packet_reset(struct sctp_packet *packet) { /* sctp_packet_transmit() relies on this to reset size to the * current overhead after sending packets. */ packet->size = packet->overhead; packet->has_cookie_echo = 0; packet->has_sack = 0; packet->has_data = 0; packet->has_auth = 0; packet->ipfragok = 0; packet->auth = NULL; } /* Config a packet. * This appears to be a followup set of initializations. */ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_sock *sp = NULL; struct sock *sk; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; /* do the following jobs only once for a flush schedule */ if (!sctp_packet_empty(packet)) return; /* set packet max_size with pathmtu, then calculate overhead */ packet->max_size = tp->pathmtu; if (asoc) { sk = asoc->base.sk; sp = sctp_sk(sk); } packet->overhead = sctp_mtu_payload(sp, 0, 0); packet->size = packet->overhead; if (!asoc) return; /* update dst or transport pathmtu if in need */ if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sp); if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); } else if (!sctp_transport_pl_enabled(tp) && asoc->param_flags & SPP_PMTUD_ENABLE) { if (!sctp_transport_pmtu_check(tp)) sctp_assoc_sync_pmtu(asoc); } if (asoc->pmtu_pending) { if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); asoc->pmtu_pending = 0; } /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ if (ecn_capable) { struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc); if (chunk) sctp_packet_append_chunk(packet, chunk); } if (!tp->dst) return; /* set packet max_size with gso_max_size if gso is enabled*/ rcu_read_lock(); if (__sk_dst_get(sk) != tp->dst) { dst_hold(tp->dst); sk_setup_caps(sk, tp->dst); } packet->max_size = sk_can_gso(sk) ? min(READ_ONCE(tp->dst->dev->gso_max_size), GSO_LEGACY_MAX_SIZE) : asoc->pathmtu; rcu_read_unlock(); } /* Initialize the packet structure. */ void sctp_packet_init(struct sctp_packet *packet, struct sctp_transport *transport, __u16 sport, __u16 dport) { pr_debug("%s: packet:%p transport:%p\n", __func__, packet, transport); packet->transport = transport; packet->source_port = sport; packet->destination_port = dport; INIT_LIST_HEAD(&packet->chunk_list); /* The overhead will be calculated by sctp_packet_config() */ packet->overhead = 0; sctp_packet_reset(packet); packet->vtag = 0; } /* Free a packet. */ void sctp_packet_free(struct sctp_packet *packet) { struct sctp_chunk *chunk, *tmp; pr_debug("%s: packet:%p\n", __func__, packet); list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } } /* This routine tries to append the chunk to the offered packet. If adding * the chunk causes the packet to exceed the path MTU and COOKIE_ECHO chunk * is not present in the packet, it transmits the input packet. * Data can be bundled with a packet containing a COOKIE_ECHO chunk as long * as it can fit in the packet, but any more data that does not fit in this * packet can be sent only after receiving the COOKIE_ACK. */ enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk, int one_packet, gfp_t gfp) { enum sctp_xmit retval; pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: if (!packet->has_cookie_echo) { int error = 0; error = sctp_packet_transmit(packet, gfp); if (error < 0) chunk->skb->sk->sk_err = -error; /* If we have an empty packet, then we can NOT ever * return PMTU_FULL. */ if (!one_packet) retval = sctp_packet_append_chunk(packet, chunk); } break; case SCTP_XMIT_RWND_FULL: case SCTP_XMIT_OK: case SCTP_XMIT_DELAY: break; } return retval; } /* Try to bundle a pad chunk into a packet with a heartbeat chunk for PLPMTUTD probe */ static enum sctp_xmit sctp_packet_bundle_pad(struct sctp_packet *pkt, struct sctp_chunk *chunk) { struct sctp_transport *t = pkt->transport; struct sctp_chunk *pad; int overhead = 0; if (!chunk->pmtu_probe) return SCTP_XMIT_OK; /* calculate the Padding Data size for the pad chunk */ overhead += sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); overhead += sizeof(struct sctp_sender_hb_info) + sizeof(struct sctp_pad_chunk); pad = sctp_make_pad(t->asoc, t->pl.probe_size - overhead); if (!pad) return SCTP_XMIT_DELAY; list_add_tail(&pad->list, &pkt->chunk_list); pkt->size += SCTP_PAD4(ntohs(pad->chunk_hdr->length)); chunk->transport = t; return SCTP_XMIT_OK; } /* Try to bundle an auth chunk into the packet. */ static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt, struct sctp_chunk *chunk) { struct sctp_association *asoc = pkt->transport->asoc; enum sctp_xmit retval = SCTP_XMIT_OK; struct sctp_chunk *auth; /* if we don't have an association, we can't do authentication */ if (!asoc) return retval; /* See if this is an auth chunk we are bundling or if * auth is already bundled. */ if (chunk->chunk_hdr->type == SCTP_CID_AUTH || pkt->has_auth) return retval; /* if the peer did not request this chunk to be authenticated, * don't do it */ if (!chunk->auth) return retval; auth = sctp_make_auth(asoc, chunk->shkey->key_id); if (!auth) return retval; auth->shkey = chunk->shkey; sctp_auth_shkey_hold(auth->shkey); retval = __sctp_packet_append_chunk(pkt, auth); if (retval != SCTP_XMIT_OK) sctp_chunk_free(auth); return retval; } /* Try to bundle a SACK with the packet. */ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, struct sctp_chunk *chunk) { enum sctp_xmit retval = SCTP_XMIT_OK; /* If sending DATA and haven't aleady bundled a SACK, try to * bundle one in to the packet. */ if (sctp_chunk_is_data(chunk) && !pkt->has_sack && !pkt->has_cookie_echo) { struct sctp_association *asoc; struct timer_list *timer; asoc = pkt->transport->asoc; timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; /* If the SACK timer is running, we have a pending SACK */ if (timer_pending(timer)) { struct sctp_chunk *sack; if (pkt->transport->sack_generation != pkt->transport->asoc->peer.sack_generation) return retval; asoc->a_rwnd = asoc->rwnd; sack = sctp_make_sack(asoc); if (sack) { retval = __sctp_packet_append_chunk(pkt, sack); if (retval != SCTP_XMIT_OK) { sctp_chunk_free(sack); goto out; } SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); asoc->stats.octrlchunks++; asoc->peer.sack_needed = 0; if (del_timer(timer)) sctp_association_put(asoc); } } } out: return retval; } /* Append a chunk to the offered packet reporting back any inability to do * so. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); enum sctp_xmit retval = SCTP_XMIT_OK; /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); if (retval != SCTP_XMIT_OK) goto finish; /* We believe that this chunk is OK to add to the packet */ switch (chunk->chunk_hdr->type) { case SCTP_CID_DATA: case SCTP_CID_I_DATA: /* Account for the data being in the packet */ sctp_packet_append_data(packet, chunk); /* Disallow SACK bundling after DATA. */ packet->has_sack = 1; /* Disallow AUTH bundling after DATA */ packet->has_auth = 1; /* Let it be knows that packet has DATA in it */ packet->has_data = 1; /* timestamp the chunk for rtx purposes */ chunk->sent_at = jiffies; /* Mainly used for prsctp RTX policy */ chunk->sent_count++; break; case SCTP_CID_COOKIE_ECHO: packet->has_cookie_echo = 1; break; case SCTP_CID_SACK: packet->has_sack = 1; if (chunk->asoc) chunk->asoc->stats.osacks++; break; case SCTP_CID_AUTH: packet->has_auth = 1; packet->auth = chunk; break; } /* It is OK to send this chunk. */ list_add_tail(&chunk->list, &packet->chunk_list); packet->size += chunk_len; chunk->transport = packet->transport; finish: return retval; } /* Append a chunk to the offered packet reporting back any inability to do * so. */ enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { enum sctp_xmit retval = SCTP_XMIT_OK; pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk); /* Data chunks are special. Before seeing what else we can * bundle into this packet, check to see if we are allowed to * send this DATA. */ if (sctp_chunk_is_data(chunk)) { retval = sctp_packet_can_append_data(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; } /* Try to bundle AUTH chunk */ retval = sctp_packet_bundle_auth(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; /* Try to bundle SACK chunk */ retval = sctp_packet_bundle_sack(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; retval = __sctp_packet_append_chunk(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; retval = sctp_packet_bundle_pad(packet, chunk); finish: return retval; } static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) { if (SCTP_OUTPUT_CB(head)->last == head) skb_shinfo(head)->frag_list = skb; else SCTP_OUTPUT_CB(head)->last->next = skb; SCTP_OUTPUT_CB(head)->last = skb; head->truesize += skb->truesize; head->data_len += skb->len; head->len += skb->len; refcount_add(skb->truesize, &head->sk->sk_wmem_alloc); __skb_header_release(skb); } static int sctp_packet_pack(struct sctp_packet *packet, struct sk_buff *head, int gso, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_auth_chunk *auth = NULL; struct sctp_chunk *chunk, *tmp; int pkt_count = 0, pkt_size; struct sock *sk = head->sk; struct sk_buff *nskb; int auth_len = 0; if (gso) { skb_shinfo(head)->gso_type = sk->sk_gso_type; SCTP_OUTPUT_CB(head)->last = head; } else { nskb = head; pkt_size = packet->size; goto merge; } do { /* calculate the pkt_size and alloc nskb */ pkt_size = packet->overhead; list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { int padded = SCTP_PAD4(chunk->skb->len); if (chunk == packet->auth) auth_len = padded; else if (auth_len + padded + packet->overhead > tp->pathmtu) return 0; else if (pkt_size + padded > tp->pathmtu) break; pkt_size += padded; } nskb = alloc_skb(pkt_size + MAX_HEADER, gfp); if (!nskb) return 0; skb_reserve(nskb, packet->overhead + MAX_HEADER); merge: /* merge chunks into nskb and append nskb into head list */ pkt_size -= packet->overhead; list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { int padding; list_del_init(&chunk->list); if (sctp_chunk_is_data(chunk)) { if (!sctp_chunk_retransmitted(chunk) && !tp->rto_pending) { chunk->rtt_in_progress = 1; tp->rto_pending = 1; } } padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; if (padding) skb_put_zero(chunk->skb, padding); if (chunk == packet->auth) auth = (struct sctp_auth_chunk *) skb_tail_pointer(nskb); skb_put_data(nskb, chunk->skb->data, chunk->skb->len); pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n", chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), chunk->has_tsn ? "TSN" : "No TSN", chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, ntohs(chunk->chunk_hdr->length), chunk->skb->len, chunk->rtt_in_progress); pkt_size -= SCTP_PAD4(chunk->skb->len); if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); if (!pkt_size) break; } if (auth) { sctp_auth_calculate_hmac(tp->asoc, nskb, auth, packet->auth->shkey, gfp); /* free auth if no more chunks, or add it back */ if (list_empty(&packet->chunk_list)) sctp_chunk_free(packet->auth); else list_add(&packet->auth->list, &packet->chunk_list); } if (gso) sctp_packet_gso_append(head, nskb); pkt_count++; } while (!list_empty(&packet->chunk_list)); if (gso) { memset(head->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); skb_shinfo(head)->gso_segs = pkt_count; skb_shinfo(head)->gso_size = GSO_BY_FRAGS; goto chksum; } if (sctp_checksum_disable) return 1; if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) || dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) { struct sctphdr *sh = (struct sctphdr *)skb_transport_header(head); sh->checksum = sctp_compute_cksum(head, 0); } else { chksum: head->ip_summed = CHECKSUM_PARTIAL; head->csum_not_inet = 1; head->csum_start = skb_transport_header(head) - head->head; head->csum_offset = offsetof(struct sctphdr, checksum); } return pkt_count; } /* All packets are sent to the network through this function from * sctp_outq_tail(). * * The return value is always 0 for now. */ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; struct sk_buff *head; struct sctphdr *sh; struct sock *sk; pr_debug("%s: packet:%p\n", __func__, packet); if (list_empty(&packet->chunk_list)) return 0; chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; if (packet->size > tp->pathmtu && !packet->ipfragok && !chunk->pmtu_probe) { if (tp->pl.state == SCTP_PL_ERROR) { /* do IP fragmentation if in Error state */ packet->ipfragok = 1; } else { if (!sk_can_gso(sk)) { /* check gso */ pr_err_once("Trying to GSO but underlying device doesn't support it."); goto out; } gso = 1; } } /* alloc head skb */ head = alloc_skb((gso ? packet->overhead : packet->size) + MAX_HEADER, gfp); if (!head) goto out; skb_reserve(head, packet->overhead + MAX_HEADER); skb_set_owner_w(head, sk); /* set sctp header */ sh = skb_push(head, sizeof(struct sctphdr)); skb_reset_transport_header(head); sh->source = htons(packet->source_port); sh->dest = htons(packet->destination_port); sh->vtag = htonl(packet->vtag); sh->checksum = 0; /* drop packet if no dst */ if (!tp->dst) { IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(head); goto out; } /* pack up chunks */ pkt_count = sctp_packet_pack(packet, head, gso, gfp); if (!pkt_count) { kfree_skb(head); goto out; } pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len); /* start autoclose timer */ if (packet->has_data && sctp_state(asoc, ESTABLISHED) && asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { struct timer_list *timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; unsigned long timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; if (!mod_timer(timer, jiffies + timeout)) sctp_association_hold(asoc); } /* sctp xmit */ tp->af_specific->ecn_capable(sk); if (asoc) { asoc->stats.opackets += pkt_count; if (asoc->peer.last_sent_to != tp) asoc->peer.last_sent_to = tp; } head->ignore_df = packet->ipfragok; if (tp->dst_pending_confirm) skb_set_dst_pending_confirm(head, 1); /* neighbour should be confirmed on successful transmission or * positive error */ if (tp->af_specific->sctp_xmit(head, tp) >= 0 && tp->dst_pending_confirm) tp->dst_pending_confirm = 0; out: list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); if (!sctp_chunk_is_data(chunk)) sctp_chunk_free(chunk); } sctp_packet_reset(packet); return 0; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This private function check to see if a chunk can be added */ static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { size_t datasize, rwnd, inflight, flight_size; struct sctp_transport *transport = packet->transport; struct sctp_association *asoc = transport->asoc; struct sctp_outq *q = &asoc->outqueue; /* RFC 2960 6.1 Transmission of DATA Chunks * * A) At any given time, the data sender MUST NOT transmit new data to * any destination transport address if its peer's rwnd indicates * that the peer has no buffer space (i.e. rwnd is 0, see Section * 6.2.1). However, regardless of the value of rwnd (including if it * is 0), the data sender can always have one DATA chunk in flight to * the receiver if allowed by cwnd (see rule B below). This rule * allows the sender to probe for a change in rwnd that the sender * missed due to the SACK having been lost in transit from the data * receiver to the data sender. */ rwnd = asoc->peer.rwnd; inflight = q->outstanding_bytes; flight_size = transport->flight_size; datasize = sctp_data_size(chunk); if (datasize > rwnd && inflight > 0) /* We have (at least) one data chunk in flight, * so we can't fall back to rule 6.1 B). */ return SCTP_XMIT_RWND_FULL; /* RFC 2960 6.1 Transmission of DATA Chunks * * B) At any given time, the sender MUST NOT transmit new data * to a given transport address if it has cwnd or more bytes * of data outstanding to that transport address. */ /* RFC 7.2.4 & the Implementers Guide 2.8. * * 3) ... * When a Fast Retransmit is being performed the sender SHOULD * ignore the value of cwnd and SHOULD NOT delay retransmission. */ if (chunk->fast_retransmit != SCTP_NEED_FRTX && flight_size >= transport->cwnd) return SCTP_XMIT_RWND_FULL; /* Nagle's algorithm to solve small-packet problem: * Inhibit the sending of new chunks when new outgoing data arrives * if any previously transmitted data on the connection remains * unacknowledged. */ if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) && !asoc->force_delay) /* Nothing unacked */ return SCTP_XMIT_OK; if (!sctp_packet_empty(packet)) /* Append to packet */ return SCTP_XMIT_OK; if (!sctp_state(asoc, ESTABLISHED)) return SCTP_XMIT_OK; /* Check whether this chunk and all the rest of pending data will fit * or delay in hopes of bundling a full sized packet. */ if (chunk->skb->len + q->out_qlen > transport->pathmtu - packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4) /* Enough data queued to fill a packet */ return SCTP_XMIT_OK; /* Don't delay large message writes that may have been fragmented */ if (!chunk->msg->can_delay) return SCTP_XMIT_OK; /* Defer until all data acked or packet full */ return SCTP_XMIT_DELAY; } /* This private function does management things when adding DATA chunk */ static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { struct sctp_transport *transport = packet->transport; size_t datasize = sctp_data_size(chunk); struct sctp_association *asoc = transport->asoc; u32 rwnd = asoc->peer.rwnd; /* Keep track of how many bytes are in flight over this transport. */ transport->flight_size += datasize; /* Keep track of how many bytes are in flight to the receiver. */ asoc->outqueue.outstanding_bytes += datasize; /* Update our view of the receiver's rwnd. */ if (datasize < rwnd) rwnd -= datasize; else rwnd = 0; asoc->peer.rwnd = rwnd; sctp_chunk_assign_tsn(chunk); asoc->stream.si->assign_number(chunk); } static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { enum sctp_xmit retval = SCTP_XMIT_OK; size_t psize, pmtu, maxsize; /* Don't bundle in this packet if this chunk's auth key doesn't * match other chunks already enqueued on this packet. Also, * don't bundle the chunk with auth key if other chunks in this * packet don't have auth key. */ if ((packet->auth && chunk->shkey != packet->auth->shkey) || (!packet->auth && chunk->shkey && chunk->chunk_hdr->type != SCTP_CID_AUTH)) return SCTP_XMIT_PMTU_FULL; psize = packet->size; if (packet->transport->asoc) pmtu = packet->transport->asoc->pathmtu; else pmtu = packet->transport->pathmtu; /* Decide if we need to fragment or resubmit later. */ if (psize + chunk_len > pmtu) { /* It's OK to fragment at IP level if any one of the following * is true: * 1. The packet is empty (meaning this chunk is greater * the MTU) * 2. The packet doesn't have any data in it yet and data * requires authentication. */ if (sctp_packet_empty(packet) || (!packet->has_data && chunk->auth)) { /* We no longer do re-fragmentation. * Just fragment at the IP layer, if we * actually hit this condition */ packet->ipfragok = 1; goto out; } /* Similarly, if this chunk was built before a PMTU * reduction, we have to fragment it at IP level now. So * if the packet already contains something, we need to * flush. */ maxsize = pmtu - packet->overhead; if (packet->auth) maxsize -= SCTP_PAD4(packet->auth->skb->len); if (chunk_len > maxsize) retval = SCTP_XMIT_PMTU_FULL; /* It is also okay to fragment if the chunk we are * adding is a control chunk, but only if current packet * is not a GSO one otherwise it causes fragmentation of * a large frame. So in this case we allow the * fragmentation by forcing it to be in a new packet. */ if (!sctp_chunk_is_data(chunk) && packet->has_data) retval = SCTP_XMIT_PMTU_FULL; if (psize + chunk_len > packet->max_size) /* Hit GSO/PMTU limit, gotta flush */ retval = SCTP_XMIT_PMTU_FULL; if (!packet->transport->burst_limited && psize + chunk_len > (packet->transport->cwnd >> 1)) /* Do not allow a single GSO packet to use more * than half of cwnd. */ retval = SCTP_XMIT_PMTU_FULL; if (packet->transport->burst_limited && psize + chunk_len > (packet->transport->burst_limited >> 1)) /* Do not allow a single GSO packet to use more * than half of original cwnd. */ retval = SCTP_XMIT_PMTU_FULL; /* Otherwise it will fit in the GSO packet */ } out: return retval; }
6 3 3 4 3 3 4 1 5 5 1 1 2 10 1 4 6 3 3 3 2 1 11 11 6 6 4 2 6 6 6 20 20 20 2 2 2 2 2 2 2 2 1 3 3 2 2 2 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing */ #include "multicast.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/icmpv6.h> #include <linux/if_bridge.h> #include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jiffies.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/addrconf.h> #include <net/genetlink.h> #include <net/if_inet6.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" static void batadv_mcast_mla_update(struct work_struct *work); /** * batadv_mcast_start_timer() - schedule the multicast periodic worker * @bat_priv: the bat priv with all the soft interface information */ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) { queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work, msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD)); } /** * batadv_mcast_get_bridge() - get the bridge on top of the softif if it exists * @soft_iface: netdev struct of the mesh interface * * If the given soft interface has a bridge on top then the refcount * of the according net device is increased. * * Return: NULL if no such bridge exists. Otherwise the net device of the * bridge. */ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) { struct net_device *upper = soft_iface; rcu_read_lock(); do { upper = netdev_master_upper_dev_get_rcu(upper); } while (upper && !netif_is_bridge_master(upper)); dev_hold(upper); rcu_read_unlock(); return upper; } /** * batadv_mcast_mla_rtr_flags_softif_get_ipv4() - get mcast router flags from * node for IPv4 * @dev: the interface to check * * Checks the presence of an IPv4 multicast router on this node. * * Caller needs to hold rcu read lock. * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise. */ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv4(struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_MFORWARD(in_dev)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR4; } /** * batadv_mcast_mla_rtr_flags_softif_get_ipv6() - get mcast router flags from * node for IPv6 * @dev: the interface to check * * Checks the presence of an IPv6 multicast router on this node. * * Caller needs to hold rcu read lock. * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise. */ #if IS_ENABLED(CONFIG_IPV6_MROUTE) static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) { struct inet6_dev *in6_dev = __in6_dev_get(dev); if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR6; } #else static inline u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) { return BATADV_MCAST_WANT_NO_RTR6; } #endif /** * batadv_mcast_mla_rtr_flags_softif_get() - get mcast router flags from node * @bat_priv: the bat priv with all the soft interface information * @bridge: bridge interface on top of the soft_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this * node. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, struct net_device *bridge) { struct net_device *dev = bridge ? bridge : bat_priv->soft_iface; u8 flags = BATADV_NO_FLAGS; rcu_read_lock(); flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv4(dev); flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv6(dev); rcu_read_unlock(); return flags; } /** * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge * @bat_priv: the bat priv with all the soft interface information * @bridge: bridge interface on top of the soft_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, struct net_device *bridge) { struct net_device *dev = bat_priv->soft_iface; u8 flags = BATADV_NO_FLAGS; if (!bridge) return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; if (!br_multicast_has_router_adjacent(dev, ETH_P_IP)) flags |= BATADV_MCAST_WANT_NO_RTR4; if (!br_multicast_has_router_adjacent(dev, ETH_P_IPV6)) flags |= BATADV_MCAST_WANT_NO_RTR6; return flags; } /** * batadv_mcast_mla_rtr_flags_get() - get multicast router flags * @bat_priv: the bat priv with all the soft interface information * @bridge: bridge interface on top of the soft_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this * node or behind its bridge. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, struct net_device *bridge) { u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; flags &= batadv_mcast_mla_rtr_flags_softif_get(bat_priv, bridge); flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge); return flags; } /** * batadv_mcast_mla_forw_flags_get() - get multicast forwarding flags * @bat_priv: the bat priv with all the soft interface information * * Checks if all active hard interfaces have an MTU larger or equal to 1280 * bytes (IPv6 minimum MTU). * * Return: BATADV_MCAST_HAVE_MC_PTYPE_CAPA if yes, BATADV_NO_FLAGS otherwise. */ static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv) { const struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (hard_iface->net_dev->mtu < IPV6_MIN_MTU) { rcu_read_unlock(); return BATADV_NO_FLAGS; } } rcu_read_unlock(); return BATADV_MCAST_HAVE_MC_PTYPE_CAPA; } /** * batadv_mcast_mla_flags_get() - get the new multicast flags * @bat_priv: the bat priv with all the soft interface information * * Return: A set of flags for the current/next TVLV, querier and * bridge state. */ static struct batadv_mcast_mla_flags batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) { struct net_device *dev = bat_priv->soft_iface; struct batadv_mcast_querier_state *qr4, *qr6; struct batadv_mcast_mla_flags mla_flags; struct net_device *bridge; bridge = batadv_mcast_get_bridge(dev); memset(&mla_flags, 0, sizeof(mla_flags)); mla_flags.enabled = 1; mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv, bridge); mla_flags.tvlv_flags |= batadv_mcast_mla_forw_flags_get(bat_priv); if (!bridge) return mla_flags; dev_put(bridge); mla_flags.bridged = 1; qr4 = &mla_flags.querier_ipv4; qr6 = &mla_flags.querier_ipv6; if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)) pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); qr4->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); qr4->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); qr6->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); qr6->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; /* 1) If no querier exists at all, then multicast listeners on * our local TT clients behind the bridge will keep silent. * 2) If the selected querier is on one of our local TT clients, * behind the bridge, then this querier might shadow multicast * listeners on our local TT clients, behind this bridge. * * In both cases, we will signalize other batman nodes that * we need all multicast traffic of the according protocol. */ if (!qr4->exists || qr4->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4; mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4; } if (!qr6->exists || qr6->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6; mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6; } return mla_flags; } /** * batadv_mcast_mla_is_duplicate() - check whether an address is in a list * @mcast_addr: the multicast address to check * @mcast_list: the list with multicast addresses to search in * * Return: true if the given address is already in the given list. * Otherwise returns false. */ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; hlist_for_each_entry(mcast_entry, mcast_list, list) if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) return true; return false; } /** * batadv_mcast_mla_softif_get_ipv4() - get softif IPv4 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv4 multicast listeners residing * on this kernel on the given soft interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_softif_get_ipv4(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct batadv_hw_addr *new; struct in_device *in_dev; u8 mcast_addr[ETH_ALEN]; struct ip_mc_list *pmc; int ret = 0; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) return 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return 0; } for (pmc = rcu_dereference(in_dev->mc_list); pmc; pmc = rcu_dereference(pmc->next_rcu)) { if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(pmc->multiaddr)) continue; if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && !ipv4_is_local_multicast(pmc->multiaddr)) continue; ip_eth_mc_map(pmc->multiaddr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } rcu_read_unlock(); return ret; } /** * batadv_mcast_mla_softif_get_ipv6() - get softif IPv6 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv6 multicast listeners residing * on this kernel on the given soft interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ #if IS_ENABLED(CONFIG_IPV6) static int batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct batadv_hw_addr *new; struct inet6_dev *in6_dev; u8 mcast_addr[ETH_ALEN]; struct ifmcaddr6 *pmc6; int ret = 0; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) return 0; rcu_read_lock(); in6_dev = __in6_dev_get(dev); if (!in6_dev) { rcu_read_unlock(); return 0; } for (pmc6 = rcu_dereference(in6_dev->mc_list); pmc6; pmc6 = rcu_dereference(pmc6->next)) { if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr)) continue; if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } rcu_read_unlock(); return ret; } #else static inline int batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { return 0; } #endif /** * batadv_mcast_mla_softif_get() - get softif multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on this kernel on the given soft interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * If there is a bridge interface on top of dev, collect from that one * instead. Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_softif_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct net_device *bridge = batadv_mcast_get_bridge(dev); int ret4, ret6 = 0; if (bridge) dev = bridge; ret4 = batadv_mcast_mla_softif_get_ipv4(dev, mcast_list, flags); if (ret4 < 0) goto out; ret6 = batadv_mcast_mla_softif_get_ipv6(dev, mcast_list, flags); if (ret6 < 0) { ret4 = 0; goto out; } out: dev_put(bridge); return ret4 + ret6; } /** * batadv_mcast_mla_br_addr_cpy() - copy a bridge multicast address * @dst: destination to write to - a multicast MAC address * @src: source to read from - a multicast IP address * * Converts a given multicast IPv4/IPv6 address from a bridge * to its matching multicast MAC address and copies it into the given * destination buffer. * * Caller needs to make sure the destination buffer can hold * at least ETH_ALEN bytes. */ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) { if (src->proto == htons(ETH_P_IP)) ip_eth_mc_map(src->dst.ip4, dst); #if IS_ENABLED(CONFIG_IPV6) else if (src->proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&src->dst.ip6, dst); #endif else eth_zero_addr(dst); } /** * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on foreign, non-mesh devices which we gave access to our mesh via * a bridge on top of the given soft interface, dev, in the given * mcast_list. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_bridge_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); struct br_ip_list *br_ip_entry, *tmp; u8 tvlv_flags = flags->tvlv_flags; struct batadv_hw_addr *new; u8 mcast_addr[ETH_ALEN]; int ret; /* we don't need to detect these devices/listeners, the IGMP/MLD * snooping code of the Linux bridge already does that for us */ ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); if (ret < 0) goto out; list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { if (br_ip_entry->addr.proto == htons(ETH_P_IP)) { if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && !ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; } #if IS_ENABLED(CONFIG_IPV6) if (br_ip_entry->addr.proto == htons(ETH_P_IPV6)) { if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.dst.ip6)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.dst.ip6) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; } #endif batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); } out: list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { list_del(&br_ip_entry->list); kfree(br_ip_entry); } return ret; } /** * batadv_mcast_mla_list_free() - free a list of multicast addresses * @mcast_list: the list to free * * Removes and frees all items in the given mcast_list. */ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_retract() - clean up multicast listener announcements * @bat_priv: the bat priv with all the soft interface information * @mcast_list: a list of addresses which should _not_ be removed * * Retracts the announcement of any multicast listener from the * translation table except the ones listed in the given mcast_list. * * If mcast_list is NULL then all are retracted. */ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, list) { if (mcast_list && batadv_mcast_mla_is_duplicate(mcast_entry->addr, mcast_list)) continue; batadv_tt_local_remove(bat_priv, mcast_entry->addr, BATADV_NO_FLAGS, "mcast TT outdated", false); hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_add() - add multicast listener announcements * @bat_priv: the bat priv with all the soft interface information * @mcast_list: a list of addresses which are going to get added * * Adds multicast listener announcements from the given mcast_list to the * translation table if they have not been added yet. */ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; if (!mcast_list) return; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { if (batadv_mcast_mla_is_duplicate(mcast_entry->addr, &bat_priv->mcast.mla_list)) continue; if (!batadv_tt_local_add(bat_priv->soft_iface, mcast_entry->addr, BATADV_NO_FLAGS, BATADV_NULL_IFINDEX, BATADV_NO_MARK)) continue; hlist_del(&mcast_entry->list); hlist_add_head(&mcast_entry->list, &bat_priv->mcast.mla_list); } } /** * batadv_mcast_querier_log() - debug output regarding the querier status on * link * @bat_priv: the bat priv with all the soft interface information * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD") * @old_state: the previous querier state on our link * @new_state: the new querier state on our link * * Outputs debug messages to the logging facility with log level 'mcast' * regarding changes to the querier status on the link which are relevant * to our multicast optimizations. * * Usually this is about whether a querier appeared or vanished in * our mesh or whether the querier is in the suboptimal position of being * behind our local bridge segment: Snooping switches will directly * forward listener reports to the querier, therefore batman-adv and * the bridge will potentially not see these listeners - the querier is * potentially shadowing listeners from us then. * * This is only interesting for nodes with a bridge on top of their * soft interface. */ static void batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, struct batadv_mcast_querier_state *old_state, struct batadv_mcast_querier_state *new_state) { if (!old_state->exists && new_state->exists) batadv_info(bat_priv->soft_iface, "%s Querier appeared\n", str_proto); else if (old_state->exists && !new_state->exists) batadv_info(bat_priv->soft_iface, "%s Querier disappeared - multicast optimizations disabled\n", str_proto); else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists) batadv_info(bat_priv->soft_iface, "No %s Querier present - multicast optimizations disabled\n", str_proto); if (new_state->exists) { if ((!old_state->shadowing && new_state->shadowing) || (!old_state->exists && new_state->shadowing)) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is behind our bridged segment: Might shadow listeners\n", str_proto); else if (old_state->shadowing && !new_state->shadowing) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is not behind our bridged segment\n", str_proto); } } /** * batadv_mcast_bridge_log() - debug output for topology changes in bridged * setups * @bat_priv: the bat priv with all the soft interface information * @new_flags: flags indicating the new multicast state * * If no bridges are ever used on this node, then this function does nothing. * * Otherwise this function outputs debug information to the 'mcast' log level * which might be relevant to our multicast optimizations. * * More precisely, it outputs information when a bridge interface is added or * removed from a soft interface. And when a bridge is present, it further * outputs information about the querier state which is relevant for the * multicast flags this node is going to set. */ static void batadv_mcast_bridge_log(struct batadv_priv *bat_priv, struct batadv_mcast_mla_flags *new_flags) { struct batadv_mcast_mla_flags *old_flags = &bat_priv->mcast.mla_flags; if (!old_flags->bridged && new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge added: Setting Unsnoopables(U)-flag\n"); else if (old_flags->bridged && !new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge removed: Unsetting Unsnoopables(U)-flag\n"); if (new_flags->bridged) { batadv_mcast_querier_log(bat_priv, "IGMP", &old_flags->querier_ipv4, &new_flags->querier_ipv4); batadv_mcast_querier_log(bat_priv, "MLD", &old_flags->querier_ipv6, &new_flags->querier_ipv6); } } /** * batadv_mcast_flags_log() - output debug information about mcast flag changes * @bat_priv: the bat priv with all the soft interface information * @flags: TVLV flags indicating the new multicast state * * Whenever the multicast TVLV flags this node announces change, this function * should be used to notify userspace about the change. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { bool old_enabled = bat_priv->mcast.mla_flags.enabled; u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags; char str_old_flags[] = "[.... . .]"; sprintf(str_old_flags, "[%c%c%c%s%s%c]", (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ", !(old_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.'); batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Changing multicast flags from '%s' to '[%c%c%c%s%s%c]'\n", old_enabled ? str_old_flags : "<undefined>", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ", !(flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.'); } /** * batadv_mcast_mla_flags_update() - update multicast flags * @bat_priv: the bat priv with all the soft interface information * @flags: flags indicating the new multicast state * * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. */ static void batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv, struct batadv_mcast_mla_flags *flags) { struct batadv_tvlv_mcast_data mcast_data; if (!memcmp(flags, &bat_priv->mcast.mla_flags, sizeof(*flags))) return; batadv_mcast_bridge_log(bat_priv, flags); batadv_mcast_flags_log(bat_priv, flags->tvlv_flags); mcast_data.flags = flags->tvlv_flags; memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, &mcast_data, sizeof(mcast_data)); bat_priv->mcast.mla_flags = *flags; } /** * __batadv_mcast_mla_update() - update the own MLAs * @bat_priv: the bat priv with all the soft interface information * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are * ensured by the non-parallel execution of the worker this function * belongs to. */ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) { struct net_device *soft_iface = bat_priv->soft_iface; struct hlist_head mcast_list = HLIST_HEAD_INIT; struct batadv_mcast_mla_flags flags; int ret; flags = batadv_mcast_mla_flags_get(bat_priv); ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list, &flags); if (ret < 0) goto out; ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list, &flags); if (ret < 0) goto out; spin_lock(&bat_priv->mcast.mla_lock); batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); batadv_mcast_mla_tt_add(bat_priv, &mcast_list); batadv_mcast_mla_flags_update(bat_priv, &flags); spin_unlock(&bat_priv->mcast.mla_lock); out: batadv_mcast_mla_list_free(&mcast_list); } /** * batadv_mcast_mla_update() - update the own MLAs * @work: kernel work struct * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * In the end, reschedules the work timer. */ static void batadv_mcast_mla_update(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_mcast *priv_mcast; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work); bat_priv = container_of(priv_mcast, struct batadv_priv, mcast); __batadv_mcast_mla_update(bat_priv); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_is_report_ipv4() - check for IGMP reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid IGMP report. * * Return: If so then true, otherwise false. */ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) { if (ip_mc_check_igmp(skb) < 0) return false; switch (igmp_hdr(skb)->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv4() - check for optimized forwarding * potential * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory * allocation failure. */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct iphdr *iphdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv4(skb)) return -EINVAL; iphdr = ip_hdr(skb); /* link-local multicast listeners behind a bridge are * not snoopable (see RFC4541, section 2.1.2.2) */ if (ipv4_is_local_multicast(iphdr->daddr)) *is_unsnoopable = true; else *is_routable = ETH_P_IP; return 0; } /** * batadv_mcast_is_report_ipv6() - check for MLD reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid MLD report. * * Return: If so then true, otherwise false. */ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) { if (ipv6_mc_check_mld(skb) < 0) return false; switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_MGM_REPORT: case ICMPV6_MLD2_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv6() - check for optimized forwarding * potential * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct ipv6hdr *ip6hdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv6(skb)) return -EINVAL; ip6hdr = ipv6_hdr(skb); if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) < IPV6_ADDR_SCOPE_LINKLOCAL) return -EINVAL; /* link-local-all-nodes multicast listeners behind a bridge are * not snoopable (see RFC4541, section 3, paragraph 3) */ if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr)) *is_unsnoopable = true; else if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) > IPV6_ADDR_SCOPE_LINKLOCAL) *is_routable = ETH_P_IPV6; return 0; } /** * batadv_mcast_forw_mode_check() - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast frame to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct ethhdr *ethhdr = eth_hdr(skb); if (!atomic_read(&bat_priv->multicast_mode)) return -EINVAL; switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, is_unsnoopable, is_routable); case ETH_P_IPV6: if (!IS_ENABLED(CONFIG_IPV6)) return -EINVAL; return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, is_unsnoopable, is_routable); default: return -EINVAL; } } /** * batadv_mcast_forw_want_all_ip_count() - count nodes with unspecific mcast * interest * @bat_priv: the bat priv with all the soft interface information * @ethhdr: ethernet header of a packet * * Return: the number of nodes which want all IPv4 multicast traffic if the * given ethhdr is from an IPv4 packet or the number of nodes which want all * IPv6 traffic if it matches an IPv6 packet. */ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_ipv4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_ipv6); default: /* we shouldn't be here... */ return 0; } } /** * batadv_mcast_forw_rtr_count() - count nodes with a multicast router * @bat_priv: the bat priv with all the soft interface information * @protocol: the ethernet protocol type to count multicast routers for * * Return: the number of nodes which want all routable IPv4 multicast traffic * if the protocol is ETH_P_IP or the number of nodes which want all routable * IPv6 traffic if the protocol is ETH_P_IPV6. Otherwise returns 0. */ static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, int protocol) { switch (protocol) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_rtr4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_rtr6); default: return 0; } } /** * batadv_mcast_forw_mode_by_count() - get forwarding mode by count * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to check * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * @count: the number of originators the multicast packet need to be sent to * * For a multicast packet with multiple destination originators, checks which * mode to use. For BATADV_FORW_MCAST it also encapsulates the packet with a * complete batman-adv multicast header. * * Return: * BATADV_FORW_MCAST: If all nodes have multicast packet routing * capabilities and an MTU >= 1280 on all hard interfaces (including us) * and the encapsulated multicast packet with all destination addresses * would still fit into an 1280 bytes batman-adv multicast packet * (excluding the outer ethernet frame) and we could successfully push * the full batman-adv multicast packet header. * BATADV_FORW_UCASTS: If the packet cannot be sent in a batman-adv * multicast packet and the amount of batman-adv unicast packets needed * is smaller or equal to the configured multicast fanout. * BATADV_FORW_BCAST: Otherwise. */ static enum batadv_forw_mode batadv_mcast_forw_mode_by_count(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable, int count) { unsigned int mcast_hdrlen = batadv_mcast_forw_packet_hdrlen(count); u8 own_tvlv_flags = bat_priv->mcast.mla_flags.tvlv_flags; if (!atomic_read(&bat_priv->mcast.num_no_mc_ptype_capa) && own_tvlv_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA && skb->len + mcast_hdrlen <= IPV6_MIN_MTU && batadv_mcast_forw_push(bat_priv, skb, vid, is_routable, count)) return BATADV_FORW_MCAST; if (count <= atomic_read(&bat_priv->multicast_fanout)) return BATADV_FORW_UCASTS; return BATADV_FORW_BCAST; } /** * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to check * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * * Return: The forwarding mode as enum batadv_forw_mode. */ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int *is_routable) { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; struct ethhdr *ethhdr; int rtr_count = 0; ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable, is_routable); if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) return BATADV_FORW_BCAST; ethhdr = eth_hdr(skb); tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest, BATADV_NO_FLAGS); ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr); unsnoop_count = !is_unsnoopable ? 0 : atomic_read(&bat_priv->mcast.num_want_all_unsnoopables); rtr_count = batadv_mcast_forw_rtr_count(bat_priv, *is_routable); total_count = tt_count + ip_count + unsnoop_count + rtr_count; if (!total_count) return BATADV_FORW_NONE; else if (unsnoop_count) return BATADV_FORW_BCAST; return batadv_mcast_forw_mode_by_count(bat_priv, skb, vid, *is_routable, total_count); } /** * batadv_mcast_forw_send_orig() - send a multicast packet to an originator * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to send * @vid: the vlan identifier * @orig_node: the originator to send the packet to * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, struct batadv_orig_node *orig_node) { /* Avoid sending multicast-in-unicast packets to other BLA * gateways - they already got the frame from the LAN side * we share with them. * TODO: Refactor to take BLA into account earlier, to avoid * reducing the mcast_fanout count. */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) { dev_kfree_skb(skb); return NET_XMIT_SUCCESS; } return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, orig_node, vid); } /** * batadv_mcast_forw_tt() - forwards a packet to multicast listeners * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any multicast * listener registered in the translation table. A transmission is performed * via a batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; struct batadv_tt_orig_list_entry *orig_entry; struct batadv_tt_global_entry *tt_global; const u8 *addr = eth_hdr(skb)->h_dest; tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global) goto out; rcu_read_lock(); hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_entry->orig_node); } rcu_read_unlock(); batadv_tt_global_entry_put(tt_global); out: return ret; } /** * batadv_mcast_forw_want_all_ipv4() - forward to nodes with want-all-ipv4 * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV4 flag set. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all_ipv6() - forward to nodes with want-all-ipv6 * @bat_priv: the bat priv with all the soft interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV6 flag set. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all() - forward packet to nodes in a want-all list * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV4 or BATADV_MCAST_WANT_ALL_IPV6 flag set. A * transmission is performed via a batman-adv unicast packet for each such * destination node. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_want_all(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_want_all_ipv4(bat_priv, skb, vid); case ETH_P_IPV6: return batadv_mcast_forw_want_all_ipv6(bat_priv, skb, vid); default: /* we shouldn't be here... */ return NET_XMIT_DROP; } } /** * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4 * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR4 flag unset. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_rtr4_list, mcast_want_all_rtr4_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6 * @bat_priv: the bat priv with all the soft interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR6 flag unset. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_rtr6_list, mcast_want_all_rtr6_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR4 or BATADV_MCAST_WANT_NO_RTR6 flag unset. A * transmission is performed via a batman-adv unicast packet for each such * destination node. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_want_all_rtr4(bat_priv, skb, vid); case ETH_P_IPV6: return batadv_mcast_forw_want_all_rtr6(bat_priv, skb, vid); default: /* we shouldn't be here... */ return NET_XMIT_DROP; } } /** * batadv_mcast_forw_send() - send packet to any detected multicast recipient * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * * Sends copies of a frame with multicast destination to any node that signaled * interest in it, that is either via the translation table or the according * want-all flags. A transmission is performed via a batman-adv unicast packet * for each such destination node. * * The given skb is consumed/freed. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable) { int ret; ret = batadv_mcast_forw_tt(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } ret = batadv_mcast_forw_want_all(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } if (!is_routable) goto skip_mc_router; ret = batadv_mcast_forw_want_rtr(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } skip_mc_router: consume_skb(skb); return ret; } /** * batadv_mcast_want_unsnoop_update() - update unsnoop counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, * orig, has toggled then this method updates the counter and the list * accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node; struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) { atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) { atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv4_update() - update want-all-ipv4 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv4_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) { atomic_dec(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv6_update() - update want-all-ipv6 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv6_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) { atomic_dec(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_rtr4_node; struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) && orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) { atomic_inc(&bat_priv->mcast.num_want_all_rtr4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag unset to set */ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 && !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) { atomic_dec(&bat_priv->mcast.num_want_all_rtr4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_rtr6_node; struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) && orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) { atomic_inc(&bat_priv->mcast.num_want_all_rtr6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag unset to set */ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 && !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) { atomic_dec(&bat_priv->mcast.num_want_all_rtr6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_have_mc_ptype_update() - update multicast packet type counter * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_HAVE_MC_PTYPE_CAPA flag of this originator, orig, has * toggled then this method updates the counter accordingly. */ static void batadv_mcast_have_mc_ptype_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) && orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) atomic_inc(&bat_priv->mcast.num_no_mc_ptype_capa); /* switched from flag unset to set */ else if (mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA && !(orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA)) atomic_dec(&bat_priv->mcast.num_no_mc_ptype_capa); } /** * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV * @enabled: whether the originator has multicast TVLV support enabled * @tvlv_value: tvlv buffer containing the multicast flags * @tvlv_value_len: tvlv buffer length * * Return: multicast flags for the given tvlv buffer */ static u8 batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len) { u8 mcast_flags = BATADV_NO_FLAGS; if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags)) mcast_flags = *(u8 *)tvlv_value; if (!enabled) { mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; } /* remove redundant flags to avoid sending duplicate packets later */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) mcast_flags |= BATADV_MCAST_WANT_NO_RTR4; if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; return mcast_flags; } /** * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the multicast data * @tvlv_value_len: tvlv buffer length */ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); u8 mcast_flags; mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled, tvlv_value, tvlv_value_len); spin_lock_bh(&orig->mcast_handler_lock); if (orig_mcast_enabled && !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } else if (!orig_mcast_enabled && test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags); batadv_mcast_have_mc_ptype_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; spin_unlock_bh(&orig->mcast_handler_lock); } /** * batadv_mcast_init() - initialize the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information */ void batadv_mcast_init(struct batadv_priv *bat_priv) { batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, NULL, NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_tvlv_handler_register(bat_priv, NULL, NULL, batadv_mcast_forw_tracker_tvlv_handler, BATADV_TVLV_MCAST_TRACKER, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_mesh_info_put() - put multicast info into a netlink message * @msg: buffer for the message * @bat_priv: the bat priv with all the soft interface information * * Return: 0 or error code. */ int batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv) { u32 flags = bat_priv->mcast.mla_flags.tvlv_flags; u32 flags_priv = BATADV_NO_FLAGS; if (bat_priv->mcast.mla_flags.bridged) { flags_priv |= BATADV_MCAST_FLAGS_BRIDGED; if (bat_priv->mcast.mla_flags.querier_ipv4.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS; if (bat_priv->mcast.mla_flags.querier_ipv6.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS; if (bat_priv->mcast.mla_flags.querier_ipv4.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING; if (bat_priv->mcast.mla_flags.querier_ipv6.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING; } if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, flags) || nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS_PRIV, flags_priv)) return -EMSGSIZE; return 0; } /** * batadv_mcast_flags_dump_entry() - dump one entry of the multicast flags table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @orig_node: originator to dump the multicast flags of * * Return: 0 or error code. */ static int batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_orig_node *orig_node) { void *hdr; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); if (!hdr) return -ENOBUFS; genl_dump_check_consistent(cb, hdr); if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } if (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capabilities)) { if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, orig_node->mcast_flags)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } } genlmsg_end(msg, hdr); return 0; } /** * batadv_mcast_flags_dump_bucket() - dump one bucket of the multicast flags * table to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hashtable *hash, unsigned int bucket, long *idx_skip) { struct batadv_orig_node *orig_node; long idx = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) { if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capa_initialized)) continue; if (idx < *idx_skip) goto skip; if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) { spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; } skip: idx++; } spin_unlock_bh(&hash->list_locks[bucket]); return 0; } /** * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @bat_priv: the bat priv with all the soft interface information * @bucket: current bucket to dump * @idx: index in current bucket to the next entry to dump * * Return: 0 or error code. */ static int __batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, long *bucket, long *idx) { struct batadv_hashtable *hash = bat_priv->orig_hash; long bucket_tmp = *bucket; long idx_tmp = *idx; while (bucket_tmp < hash->size) { if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash, bucket_tmp, &idx_tmp)) break; bucket_tmp++; idx_tmp = 0; } *bucket = bucket_tmp; *idx = idx_tmp; return msg->len; } /** * batadv_mcast_netlink_get_primary() - get primary interface from netlink * callback * @cb: netlink callback structure * @primary_if: the primary interface pointer to return the result in * * Return: 0 or error code. */ static int batadv_mcast_netlink_get_primary(struct netlink_callback *cb, struct batadv_hard_iface **primary_if) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *soft_iface; struct batadv_priv *bat_priv; int ret = 0; soft_iface = batadv_netlink_get_softif(cb); if (IS_ERR(soft_iface)) return PTR_ERR(soft_iface); bat_priv = netdev_priv(soft_iface); hard_iface = batadv_primary_if_get_selected(bat_priv); if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } out: dev_put(soft_iface); if (!ret && primary_if) *primary_if = hard_iface; else batadv_hardif_put(hard_iface); return ret; } /** * batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct batadv_priv *bat_priv; long *bucket = &cb->args[0]; long *idx = &cb->args[1]; int ret; ret = batadv_mcast_netlink_get_primary(cb, &primary_if); if (ret) return ret; bat_priv = netdev_priv(primary_if->soft_iface); ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx); batadv_hardif_put(primary_if); return ret; } /** * batadv_mcast_free() - free the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information */ void batadv_mcast_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->mcast.work); batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST_TRACKER, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2); /* safely calling outside of worker, as worker was canceled above */ batadv_mcast_mla_tt_retract(bat_priv, NULL); } /** * batadv_mcast_purge_orig() - reset originator global mcast state modifications * @orig: the originator which is going to get purged */ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) { struct batadv_priv *bat_priv = orig->bat_priv; spin_lock_bh(&orig->mcast_handler_lock); batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR4); batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR6); batadv_mcast_have_mc_ptype_update(bat_priv, orig, BATADV_MCAST_HAVE_MC_PTYPE_CAPA); spin_unlock_bh(&orig->mcast_handler_lock); }
4 1384 35 15 707 829 39 1359 1375 715 845 1360 119 799 809 983 992 5 5 327 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0-or-later /* * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IP/TCP/UDP checksumming routines * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Tom May, <ftom@netcom.com> * Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de> * Lots of code moved from tcp.c and ip.c; see those files * for more names. * * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: * Fixed some nasty bugs, causing some horrible crashes. * A: At some points, the sum (%0) was used as * length-counter instead of the length counter * (%1). Thanks to Roman Hodek for pointing this out. * B: GCC seems to mess up if one uses too many * data-registers to hold input values and one tries to * specify d0 and d1 as scratch registers. Letting gcc * choose these registers itself solves the problem. */ /* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access kills, so most of the assembly has to go. */ #include <linux/export.h> #include <net/checksum.h> #include <asm/byteorder.h> #ifndef do_csum static unsigned int do_csum(const unsigned char *buff, int len) { int odd; unsigned int result = 0; if (len <= 0) goto out; odd = 1 & (unsigned long) buff; if (odd) { #ifdef __LITTLE_ENDIAN result += (*buff << 8); #else result = *buff; #endif len--; buff++; } if (len >= 2) { if (2 & (unsigned long) buff) { result += *(unsigned short *) buff; len -= 2; buff += 2; } if (len >= 4) { const unsigned char *end = buff + ((unsigned)len & ~3); unsigned int carry = 0; do { unsigned int w = *(unsigned int *) buff; buff += 4; result += carry; result += w; carry = (w > result); } while (buff < end); result += carry; result = (result & 0xffff) + (result >> 16); } if (len & 2) { result += *(unsigned short *) buff; buff += 2; } } if (len & 1) #ifdef __LITTLE_ENDIAN result += *buff; #else result += (*buff << 8); #endif result = csum_from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); out: return result; } #endif #ifndef ip_fast_csum /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. */ __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { return (__force __sum16)~do_csum(iph, ihl*4); } EXPORT_SYMBOL(ip_fast_csum); #endif /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * * returns a 32-bit number suitable for feeding into itself * or csum_tcpudp_magic * * this function must be called with even lengths, except * for the last fragment, which may be odd * * it's best to have buff aligned on a 32-bit boundary */ __wsum csum_partial(const void *buff, int len, __wsum wsum) { unsigned int sum = (__force unsigned int)wsum; unsigned int result = do_csum(buff, len); /* add in old sum, and carry.. */ result += sum; if (sum > result) result += 1; return (__force __wsum)result; } EXPORT_SYMBOL(csum_partial); /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ __sum16 ip_compute_csum(const void *buff, int len) { return (__force __sum16)~do_csum(buff, len); } EXPORT_SYMBOL(ip_compute_csum); #ifndef csum_tcpudp_nofold static inline u32 from64to32(u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); /* add up carry.. */ x = (x & 0xffffffff) + (x >> 32); return (u32)x; } __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { unsigned long long s = (__force u32)sum; s += (__force u32)saddr; s += (__force u32)daddr; #ifdef __BIG_ENDIAN s += proto + len; #else s += (proto + len) << 8; #endif return (__force __wsum)from64to32(s); } EXPORT_SYMBOL(csum_tcpudp_nofold); #endif
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Mellanox Technologies. All rights reserved */ #include <linux/debugfs.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/slab.h> #include "netdevsim.h" static int nsim_dev_empty_reporter_dump(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg, void *priv_ctx, struct netlink_ext_ack *extack) { return 0; } static int nsim_dev_empty_reporter_diagnose(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg, struct netlink_ext_ack *extack) { return 0; } static const struct devlink_health_reporter_ops nsim_dev_empty_reporter_ops = { .name = "empty", .dump = nsim_dev_empty_reporter_dump, .diagnose = nsim_dev_empty_reporter_diagnose, }; struct nsim_dev_dummy_reporter_ctx { char *break_msg; }; static int nsim_dev_dummy_reporter_recover(struct devlink_health_reporter *reporter, void *priv_ctx, struct netlink_ext_ack *extack) { struct nsim_dev_health *health = devlink_health_reporter_priv(reporter); struct nsim_dev_dummy_reporter_ctx *ctx = priv_ctx; if (health->fail_recover) { /* For testing purposes, user set debugfs fail_recover * value to true. Fail right away. */ NL_SET_ERR_MSG_MOD(extack, "User setup the recover to fail for testing purposes"); return -EINVAL; } if (ctx) { kfree(health->recovered_break_msg); health->recovered_break_msg = kstrdup(ctx->break_msg, GFP_KERNEL); if (!health->recovered_break_msg) return -ENOMEM; } return 0; } static int nsim_dev_dummy_fmsg_put(struct devlink_fmsg *fmsg, u32 binary_len) { char *binary; int i; devlink_fmsg_bool_pair_put(fmsg, "test_bool", true); devlink_fmsg_u8_pair_put(fmsg, "test_u8", 1); devlink_fmsg_u32_pair_put(fmsg, "test_u32", 3); devlink_fmsg_u64_pair_put(fmsg, "test_u64", 4); devlink_fmsg_string_pair_put(fmsg, "test_string", "somestring"); binary = kmalloc(binary_len, GFP_KERNEL | __GFP_NOWARN); if (!binary) return -ENOMEM; get_random_bytes(binary, binary_len); devlink_fmsg_binary_pair_put(fmsg, "test_binary", binary, binary_len); kfree(binary); devlink_fmsg_pair_nest_start(fmsg, "test_nest"); devlink_fmsg_obj_nest_start(fmsg); devlink_fmsg_bool_pair_put(fmsg, "nested_test_bool", false); devlink_fmsg_u8_pair_put(fmsg, "nested_test_u8", false); devlink_fmsg_obj_nest_end(fmsg); devlink_fmsg_pair_nest_end(fmsg); devlink_fmsg_arr_pair_nest_end(fmsg); devlink_fmsg_arr_pair_nest_start(fmsg, "test_u32_array"); for (i = 0; i < 10; i++) devlink_fmsg_u32_put(fmsg, i); devlink_fmsg_arr_pair_nest_end(fmsg); devlink_fmsg_arr_pair_nest_start(fmsg, "test_array_of_objects"); for (i = 0; i < 10; i++) { devlink_fmsg_obj_nest_start(fmsg); devlink_fmsg_bool_pair_put(fmsg, "in_array_nested_test_bool", false); devlink_fmsg_u8_pair_put(fmsg, "in_array_nested_test_u8", i); devlink_fmsg_obj_nest_end(fmsg); } devlink_fmsg_arr_pair_nest_end(fmsg); return 0; } static int nsim_dev_dummy_reporter_dump(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg, void *priv_ctx, struct netlink_ext_ack *extack) { struct nsim_dev_health *health = devlink_health_reporter_priv(reporter); struct nsim_dev_dummy_reporter_ctx *ctx = priv_ctx; if (ctx) devlink_fmsg_string_pair_put(fmsg, "break_message", ctx->break_msg); return nsim_dev_dummy_fmsg_put(fmsg, health->binary_len); } static int nsim_dev_dummy_reporter_diagnose(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg, struct netlink_ext_ack *extack) { struct nsim_dev_health *health = devlink_health_reporter_priv(reporter); if (health->recovered_break_msg) devlink_fmsg_string_pair_put(fmsg, "recovered_break_message", health->recovered_break_msg); return nsim_dev_dummy_fmsg_put(fmsg, health->binary_len); } static const struct devlink_health_reporter_ops nsim_dev_dummy_reporter_ops = { .name = "dummy", .recover = nsim_dev_dummy_reporter_recover, .dump = nsim_dev_dummy_reporter_dump, .diagnose = nsim_dev_dummy_reporter_diagnose, }; static ssize_t nsim_dev_health_break_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_dev_health *health = file->private_data; struct nsim_dev_dummy_reporter_ctx ctx; char *break_msg; int err; if (count == 0 || count > PAGE_SIZE) return -EINVAL; break_msg = memdup_user_nul(data, count); if (IS_ERR(break_msg)) return PTR_ERR(break_msg); if (break_msg[count - 1] == '\n') break_msg[count - 1] = '\0'; ctx.break_msg = break_msg; err = devlink_health_report(health->dummy_reporter, break_msg, &ctx); if (err) goto out; out: kfree(break_msg); return err ?: count; } static const struct file_operations nsim_dev_health_break_fops = { .open = simple_open, .write = nsim_dev_health_break_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; int nsim_dev_health_init(struct nsim_dev *nsim_dev, struct devlink *devlink) { struct nsim_dev_health *health = &nsim_dev->health; int err; health->empty_reporter = devl_health_reporter_create(devlink, &nsim_dev_empty_reporter_ops, 0, health); if (IS_ERR(health->empty_reporter)) return PTR_ERR(health->empty_reporter); health->dummy_reporter = devl_health_reporter_create(devlink, &nsim_dev_dummy_reporter_ops, 0, health); if (IS_ERR(health->dummy_reporter)) { err = PTR_ERR(health->dummy_reporter); goto err_empty_reporter_destroy; } health->ddir = debugfs_create_dir("health", nsim_dev->ddir); if (IS_ERR(health->ddir)) { err = PTR_ERR(health->ddir); goto err_dummy_reporter_destroy; } health->recovered_break_msg = NULL; debugfs_create_file("break_health", 0200, health->ddir, health, &nsim_dev_health_break_fops); health->binary_len = 16; debugfs_create_u32("binary_len", 0600, health->ddir, &health->binary_len); health->fail_recover = false; debugfs_create_bool("fail_recover", 0600, health->ddir, &health->fail_recover); return 0; err_dummy_reporter_destroy: devl_health_reporter_destroy(health->dummy_reporter); err_empty_reporter_destroy: devl_health_reporter_destroy(health->empty_reporter); return err; } void nsim_dev_health_exit(struct nsim_dev *nsim_dev) { struct nsim_dev_health *health = &nsim_dev->health; debugfs_remove_recursive(health->ddir); kfree(health->recovered_break_msg); devl_health_reporter_destroy(health->dummy_reporter); devl_health_reporter_destroy(health->empty_reporter); }
232 476 3429 1094 3100 890 142 142 321 247 362 98 299 302 302 47 387 396 268 3 275 273 845 33 846 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/types.h> struct list_lru; /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; } /** * xa_tag_pointer() - Create an XArray entry for a tagged pointer. * @p: Plain pointer. * @tag: Tag value (0, 1 or 3). * * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete an operation, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno. * @entry: Result from calling an XArray function. * * If an XArray operation cannot complete an operation, it will return * a special pointer value which encodes an errno. This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) #define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compiletime instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. * @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compiletime instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) /** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (eg you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS BITS_TO_LONGS(XA_CHUNK_SIZE) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. */ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. */ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; struct list_lru *xa_lru; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL, \ .xa_lru = NULL, \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on.` */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. */ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline int xas_get_order(struct xa_state *xas) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_advance() - Skip over sibling entries. * @xas: XArray operation state. * @index: Index of last sibling entry. * * Move the operation state to refer to the last sibling entry. * This is useful for loops that normally want to see sibling * entries but sometimes want to skip them. Use xas_set() if you * want to move to an index which is not part of this entry. */ static inline void xas_advance(struct xa_state *xas, unsigned long index) { unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0; xas->xa_index = index; xas->xa_offset = (index >> shift) & XA_CHUNK_MASK; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. */ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru) { xas->xa_lru = lru; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. */ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. */ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xa_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_next(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == XA_CHUNK_MASK)) return __xas_next(xas); xas->xa_index++; xas->xa_offset++; return xa_entry(xas->xa, node, xas->xa_offset); } #endif /* _LINUX_XARRAY_H */
7 72 768 67 11 66 472 458 456 244 3 187 1 1 18 22 1 1 136 88 4 51 18 93 4 47 17 122 2 24 470 470 471 96 9 103 137 4 25 25 25 25 315 31 316 31 137 135 1 39 39 21 16 7 285 348 80 13 32 115 114 1 33 33 32 1 33 36 3 29 105 5 103 5 6 2 8 8 8 17 11 8 17 7 6 6 6 23 23 4 4 148 23 810 604 645 776 2 4 3 1 13 6 2 6 6 217 10 2 11 11 10 2 85 59 60 7 3 3 3 3 2 2 1 2 2 3 25 26 25 26 7 6 3 4 3 2 9 9 1 2 5 5 2 2 10 2 10 6 165 1 3 1 1 1 11 9 2 11 523 153 435 22 11 7 7 359 360 359 358 360 48 21 180 297 18 309 7 311 6 313 312 32 32 1 1 28 32 15 32 2 31 1 2 32 1 1 28 31 31 30 1 29 30 29 39 40 30 31 4 4 7 7 7 4 275 8 6 4 524 2 521 522 161 422 519 4 4 3 3 4 4 3 1 8 5 4 8 8 51 49 5 2 2 2 2 2 1 2 2 2 2 6 6 2 4 2 1 1 7 1 2 4 4 4 2 1 1 113 6 119 119 119 43 15 6 6 42 1 73 1 3 32 22 8 22 22 1 20 3 20 1 20 5 2 13 1 12 10 1 23 76 75 76 1 3 3 60 10 8 2 2 67 8 7 13 2 3 1 10 70 24 1 51 9 16 6 93 61 61 22 22 12 61 56 4 136 1 1 10 57 3 61 54 3 1 1 81 136 1 1 133 3 137 136 136 13 1 6 2 3 1 1 3 1 3 2 7 1 8 7 2 6 1 1 6 1 1 1 7 5 2 1 88 79 2 177 177 176 8 3 3 3 62 156 6 161 1 1 6 1 7 18 143 57 74 163 34 16 14 159 9 4 5 2 2 5 5 2 2 29 29 4 19 6 7 6 1 23 10 1 1 11 13 12 1 12 472 472 438 99 1 474 474 2 472 2 2 2 2 67 1 67 2 2 472 1 474 454 34 238 46 217 19 1 2 16 16 1 1 1 13 3 2 162 12 43 48 20 3 39 14 1 3 27 6 3 1 3 76 75 76 76 73 100 100 3 68 71 2 4 4 4 1 4 4 3 2 7 7 6 7 4 1 1 1 1 3 3 7 4 3 2 3 4 2 3 3 5 5 5 5 2 2 2 1 7 3 3 1 4 3 1 1 1 284 317 60 6 6 23 18 22 1 16 2 21 7 58 40 5 17 6 448 447 449 8 347 5 1 87 90 43 2 2 1 3 1 1 1 1 427 2 431 16 383 31 383 389 8 1 397 392 8 150 383 80 68 116 1 1 1 2 1 112 116 116 7 1 1 317 34 263 19 2 4 3 25 25 7 8 2 1 9 16 2 2 1 1 2 2 1 3 8 1 1 7 138 138 139 136 42 128 134 110 2 2 2 139 141 11 152 88 3 1 5 4 1 76 77 69 8 135 131 2 1 1 1 131 10 8 4 5 2 7 3 4 3 4 7 6 7 7 7 7 7 6 1 14 14 6 3 3 3 3 1 2 1 1 3 3 3 2 9 9 4 5 3 2 7 3 1 1 2 2 1 1 4 5 1 5 4 3 1 3 17 2 5 11 9 7 135 23 9 89 1 7 2 3 30 3 27 9 7 8 5 5 7 1 177 4 2 2 171 2 1 111 60 1 1 1 2 23 142 1 2 1 157 158 1 2 155 1 1 2 153 5 6 2 9 138 148 7 142 1 1 27 5 22 27 22 5 2 1 130 8 136 1 138 3 2 132 1 133 1 30 102 31 93 94 95 95 11 93 93 12 1 11 5 3 4 5 5 1 2 5 2 5 5 5 3 3 3 3 1 3 3 3 3 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 8 2 5 8 8 6 5 3 2 1 7 3 1 1 2 3 1 1 1 1 1 1 1 5 5 3 23 25 61 60 35 27 1 2 1 33 17 16 33 30 3 32 1 7 6 12 11 19 17 2 9 10 18 18 18 2 2 3 2 3 3 25 14 11 11 11 1 14 25 15 10 22 3 3 5 5 1 6 7 1 6 10 11 11 8 11 11 9 1 1 2 58 6 17 50 60 1 54 1 1 1 1 1 4 49 3 47 2 1 1 2 1 4 9 1 2 9 12 8 19 3 21 1 10 30 28 3 19 12 12 31 23 5 3 3 22 2 2 10 14 26 24 1 1 24 1 1 1 13 10 2 7 23 7 80 2 78 7 5 66 65 24 66 26 46 23 5 7 1 6 1 1 3 1 2 3 19 19 19 1 1 1 1 12 1 10 8 1 1 3 6 7 1 6 7 1 1 5 5 26 26 5 5 5 1 18 5 4 12 9 7 11 21 22 22 19 1 1 6 20 3 3 1 16 17 1 15 9 7 7 7 15 15 15 17 18 15 17 1 1 1 2 2 27 4 1 22 3 1 2 17 1 8 6 1 7 9 7 7 1 7 7 7 7 6 5 1 4 3 2 4 1 6 7 2 4 4 2 4 2 2 2 6 7 7 2 5 3 1 1 9 4 4 1 4 2 2 2 2 5 18 12 7 15 6 5 4 5 4 4 9 9 5 1 34 12 2 1 32 1 3 3 1 24 14 3 14 2 11 21 27 1 27 30 30 27 29 1 1 1 24 1 1 22 26 22 17 5 3 4 3 6 5 5 3 8 5 1 1 2 4 1 5 3 2 2 2 39 5 1 35 4 30 11 30 1 26 3 7 22 21 3 12 5 3 2 4 2 2 3 3 2 2 20 7 13 18 5 2 11 4 3 9 7 2 4 2 4 5 5 5 5 5 3 3 3 2 5 5 2 3 4 2 2 10 10 5 1 4 2 2 2 9 9 2 20 20 9 9 79 79 79 13 12 1 1 913 916 191 124 13 149 78 1 1 160 157 145 55 57 41 1 3 1 3 1 2 1 15 11 2 6 89 93 93 6 24 84 145 146 129 93 8 131 131 1 1 1 1 1 1 1 1 1 1 1 1 1 833 832 833 6 6 117 33 34 34 71 85 85 2 6 84 145 66 149 149 9 149 48 149 148 149 149 149 147 1 1 156 3 156 155 1 1 4 2 149 149 149 100 148 149 147 149 149 149 29 1 8 52 53 11 4 4 42 42 1 2 14 14 94 95 12 12 11 10 1 9 10 4 5 4 1 5 1 5 5 1 1 3 5 1 3 3 147 149 149 149 149 10 10 10 319 263 107 27 94 19 2 3 18 715 716 6 718 336 41 1 3 4 4 263 22 1 3 4 107 107 107 3 1 26 29 27 27 27 27 21 21 21 21 95 4 94 94 13 9 5 13 19 19 1 1 1 1 1 2 2 1 1 3 18 18 18 1 1 1 3 716 715 719 320 320 717 719 716 10 715 835 834 1 1 16 1 15 50 3 48 169 41 75 74 9 59 22 96 130 29 6 1 1 97 96 132 132 35 1 1 13 1 1 2 1 1 11 4 4 1 1 22 12 60 1 1 36 8 1 15 18 9 11 8 5 4 12 12 11 12 2 12 12 11 6 6 6 12 2 2 12 1 1 1 12 1 1 1 12 11 11 12 94 76 17 17 1 12 12 12 18 1 12 12 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/module.h> #include <linux/init.h> #include <linux/list.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/vmalloc.h> #include <linux/rhashtable.h> #include <linux/audit.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/net_namespace.h> #include <net/sock.h> #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) #define NFT_SET_MAX_ANONLEN 16 /* limit compaction to avoid huge kmalloc/krealloc sizes. */ #define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem)) unsigned int nf_tables_net_id __read_mostly; static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static LIST_HEAD(nf_tables_destroy_list); static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); static DEFINE_SPINLOCK(nf_tables_gc_list_lock); enum { NFT_VALIDATE_SKIP = 0, NFT_VALIDATE_NEED, NFT_VALIDATE_DO, }; static struct rhltable nft_objname_ht; static u32 nft_chain_hash(const void *data, u32 len, u32 seed); static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed); static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *); static u32 nft_objname_hash(const void *data, u32 len, u32 seed); static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed); static int nft_objname_hash_cmp(struct rhashtable_compare_arg *, const void *); static const struct rhashtable_params nft_chain_ht_params = { .head_offset = offsetof(struct nft_chain, rhlhead), .key_offset = offsetof(struct nft_chain, name), .hashfn = nft_chain_hash, .obj_hashfn = nft_chain_hash_obj, .obj_cmpfn = nft_chain_hash_cmp, .automatic_shrinking = true, }; static const struct rhashtable_params nft_objname_ht_params = { .head_offset = offsetof(struct nft_object, rhlhead), .key_offset = offsetof(struct nft_object, key), .hashfn = nft_objname_hash, .obj_hashfn = nft_objname_hash_obj, .obj_cmpfn = nft_objname_hash_cmp, .automatic_shrinking = true, }; struct nft_audit_data { struct nft_table *table; int entries; int op; struct list_head list; }; static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types [NFT_MSG_NEWTABLE] = AUDIT_NFT_OP_TABLE_REGISTER, [NFT_MSG_GETTABLE] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELTABLE] = AUDIT_NFT_OP_TABLE_UNREGISTER, [NFT_MSG_NEWCHAIN] = AUDIT_NFT_OP_CHAIN_REGISTER, [NFT_MSG_GETCHAIN] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELCHAIN] = AUDIT_NFT_OP_CHAIN_UNREGISTER, [NFT_MSG_NEWRULE] = AUDIT_NFT_OP_RULE_REGISTER, [NFT_MSG_GETRULE] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELRULE] = AUDIT_NFT_OP_RULE_UNREGISTER, [NFT_MSG_NEWSET] = AUDIT_NFT_OP_SET_REGISTER, [NFT_MSG_GETSET] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELSET] = AUDIT_NFT_OP_SET_UNREGISTER, [NFT_MSG_NEWSETELEM] = AUDIT_NFT_OP_SETELEM_REGISTER, [NFT_MSG_GETSETELEM] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELSETELEM] = AUDIT_NFT_OP_SETELEM_UNREGISTER, [NFT_MSG_NEWGEN] = AUDIT_NFT_OP_GEN_REGISTER, [NFT_MSG_GETGEN] = AUDIT_NFT_OP_INVALID, [NFT_MSG_TRACE] = AUDIT_NFT_OP_INVALID, [NFT_MSG_NEWOBJ] = AUDIT_NFT_OP_OBJ_REGISTER, [NFT_MSG_GETOBJ] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELOBJ] = AUDIT_NFT_OP_OBJ_UNREGISTER, [NFT_MSG_GETOBJ_RESET] = AUDIT_NFT_OP_OBJ_RESET, [NFT_MSG_NEWFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_REGISTER, [NFT_MSG_GETFLOWTABLE] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, [NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET, }; static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state) { switch (table->validate_state) { case NFT_VALIDATE_SKIP: WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO); break; case NFT_VALIDATE_NEED: break; case NFT_VALIDATE_DO: if (new_validate_state == NFT_VALIDATE_NEED) return; } table->validate_state = new_validate_state; } static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); static void nft_trans_gc_work(struct work_struct *work); static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, u8 family, struct nft_table *table, struct nft_chain *chain, const struct nlattr * const *nla) { ctx->net = net; ctx->family = family; ctx->level = 0; ctx->table = table; ctx->chain = chain; ctx->nla = nla; ctx->portid = NETLINK_CB(skb).portid; ctx->report = nlmsg_report(nlh); ctx->flags = nlh->nlmsg_flags; ctx->seq = nlh->nlmsg_seq; bitmap_zero(ctx->reg_inited, NFT_REG32_NUM); } static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, int msg_type, u32 size, gfp_t gfp) { struct nft_trans *trans; trans = kzalloc(size, gfp); if (trans == NULL) return NULL; INIT_LIST_HEAD(&trans->list); trans->msg_type = msg_type; trans->net = ctx->net; trans->table = ctx->table; trans->seq = ctx->seq; trans->flags = ctx->flags; trans->report = ctx->report; return trans; } static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, int msg_type, u32 size) { return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); } static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans) { switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: case NFT_MSG_NEWSET: return container_of(trans, struct nft_trans_binding, nft_trans); } return NULL; } static void nft_trans_list_del(struct nft_trans *trans) { struct nft_trans_binding *trans_binding; list_del(&trans->list); trans_binding = nft_trans_get_binding(trans); if (trans_binding) list_del(&trans_binding->binding_list); } static void nft_trans_destroy(struct nft_trans *trans) { nft_trans_list_del(trans); kfree(trans); } static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set, bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; struct nft_trans *trans; if (!nft_set_is_anonymous(set)) return; nft_net = nft_pernet(net); list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) nft_trans_set_bound(trans) = bind; break; case NFT_MSG_NEWSETELEM: if (nft_trans_elem_set(trans) == set) nft_trans_elem_set_bound(trans) = bind; break; } } } static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) { return __nft_set_trans_bind(ctx, set, true); } static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set) { return __nft_set_trans_bind(ctx, set, false); } static void __nft_chain_trans_bind(const struct nft_ctx *ctx, struct nft_chain *chain, bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; struct nft_trans *trans; if (!nft_chain_binding(chain)) return; nft_net = nft_pernet(net); list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (nft_trans_chain(trans) == chain) nft_trans_chain_bound(trans) = bind; break; case NFT_MSG_NEWRULE: if (nft_trans_rule_chain(trans) == chain) nft_trans_rule_bound(trans) = bind; break; } } } static void nft_chain_trans_bind(const struct nft_ctx *ctx, struct nft_chain *chain) { __nft_chain_trans_bind(ctx, chain, true); } int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) { if (!nft_chain_binding(chain)) return 0; if (nft_chain_binding(ctx->chain)) return -EOPNOTSUPP; if (chain->bound) return -EBUSY; if (!nft_use_inc(&chain->use)) return -EMFILE; chain->bound = true; nft_chain_trans_bind(ctx, chain); return 0; } void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) { __nft_chain_trans_bind(ctx, chain, false); } static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { struct nft_hook *hook; int err, j; j = 0; list_for_each_entry(hook, hook_list, list) { err = nf_register_net_hook(net, &hook->ops); if (err < 0) goto err_register; j++; } return 0; err_register: list_for_each_entry(hook, hook_list, list) { if (j-- <= 0) break; nf_unregister_net_hook(net, &hook->ops); } return err; } static void nft_netdev_unregister_hooks(struct net *net, struct list_head *hook_list, bool release_netdev) { struct nft_hook *hook, *next; list_for_each_entry_safe(hook, next, hook_list, list) { nf_unregister_net_hook(net, &hook->ops); if (release_netdev) { list_del(&hook->list); kfree_rcu(hook, rcu); } } } static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || !nft_is_base_chain(chain)) return 0; basechain = nft_base_chain(chain); ops = &basechain->ops; if (basechain->type->ops_register) return basechain->type->ops_register(net, ops); if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) return nft_netdev_register_hooks(net, &basechain->hook_list); return nf_register_net_hook(net, &basechain->ops); } static void __nf_tables_unregister_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain, bool release_netdev) { struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || !nft_is_base_chain(chain)) return; basechain = nft_base_chain(chain); ops = &basechain->ops; if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) nft_netdev_unregister_hooks(net, &basechain->hook_list, release_netdev); else nf_unregister_net_hook(net, &basechain->ops); } static void nf_tables_unregister_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { return __nf_tables_unregister_hook(net, table, chain, false); } static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b) { /* NB: the ->bound equality check is defensive, at this time we only merge * a new nft_trans_elem transaction request with the transaction tail * element, but a->bound != b->bound would imply a NEWRULE transaction * is queued in-between. * * The set check is mandatory, the NFT_MAX_SET_NELEMS check prevents * huge krealloc() requests. */ return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS; } static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, struct nft_trans_elem *tail, struct nft_trans_elem *trans, gfp_t gfp) { unsigned int nelems, old_nelems = tail->nelems; struct nft_trans_elem *new_trans; if (!nft_trans_collapse_set_elem_allowed(tail, trans)) return false; /* "cannot happen", at this time userspace element add * requests always allocate a new transaction element. * * This serves as a reminder to adjust the list_add_tail * logic below in case this ever changes. */ if (WARN_ON_ONCE(trans->nelems != 1)) return false; if (check_add_overflow(old_nelems, trans->nelems, &nelems)) return false; /* krealloc might free tail which invalidates list pointers */ list_del_init(&tail->nft_trans.list); new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp); if (!new_trans) { list_add_tail(&tail->nft_trans.list, &nft_net->commit_list); return false; } /* * new_trans->nft_trans.list contains garbage, but * list_add_tail() doesn't care. */ new_trans->nelems = nelems; new_trans->elems[old_nelems] = trans->elems[0]; list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list); return true; } static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, struct nft_trans *trans, gfp_t gfp) { struct nft_trans *tail; if (list_empty(&nft_net->commit_list)) return false; tail = list_last_entry(&nft_net->commit_list, struct nft_trans, list); if (tail->msg_type != trans->msg_type) return false; switch (trans->msg_type) { case NFT_MSG_NEWSETELEM: case NFT_MSG_DELSETELEM: return nft_trans_collapse_set_elem(nft_net, nft_trans_container_elem(tail), nft_trans_container_elem(trans), gfp); } return false; } static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans_binding *binding; struct nft_trans_set *trans_set; list_add_tail(&trans->list, &nft_net->commit_list); binding = nft_trans_get_binding(trans); if (!binding) return; switch (trans->msg_type) { case NFT_MSG_NEWSET: trans_set = nft_trans_container_set(trans); if (!nft_trans_set_update(trans) && nft_set_is_anonymous(nft_trans_set(trans))) list_add_tail(&binding->binding_list, &nft_net->binding_list); list_add_tail(&trans_set->list_trans_newset, &nft_net->commit_set_list); break; case NFT_MSG_NEWCHAIN: if (!nft_trans_chain_update(trans) && nft_chain_binding(nft_trans_chain(trans))) list_add_tail(&binding->binding_list, &nft_net->binding_list); break; } } static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM && trans->msg_type != NFT_MSG_DELSETELEM); might_alloc(gfp); if (nft_trans_try_collapse(nft_net, trans, gfp)) { kfree(trans); return; } nft_trans_commit_list_add_tail(net, trans); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); if (trans == NULL) return -ENOMEM; if (msg_type == NFT_MSG_NEWTABLE) nft_activate_next(ctx->net, ctx->table); nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } static int nft_deltable(struct nft_ctx *ctx) { int err; err = nft_trans_table_add(ctx, NFT_MSG_DELTABLE); if (err < 0) return err; nft_deactivate_next(ctx->net, ctx->table); return err; } static struct nft_trans * nft_trans_alloc_chain(const struct nft_ctx *ctx, int msg_type) { struct nft_trans_chain *trans_chain; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); if (!trans) return NULL; trans_chain = nft_trans_container_chain(trans); INIT_LIST_HEAD(&trans_chain->nft_trans_binding.binding_list); trans_chain->chain = ctx->chain; return trans; } static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; trans = nft_trans_alloc_chain(ctx, msg_type); if (trans == NULL) return ERR_PTR(-ENOMEM); if (msg_type == NFT_MSG_NEWCHAIN) { nft_activate_next(ctx->net, ctx->chain); if (ctx->nla[NFTA_CHAIN_ID]) { nft_trans_chain_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); } } nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } static int nft_delchain(struct nft_ctx *ctx) { struct nft_trans *trans; trans = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN); if (IS_ERR(trans)) return PTR_ERR(trans); nft_use_dec(&ctx->table->use); nft_deactivate_next(ctx->net, ctx->chain); return 0; } void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->activate) expr->ops->activate(ctx, expr); expr = nft_expr_next(expr); } } void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, enum nft_trans_phase phase) { struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->deactivate) expr->ops->deactivate(ctx, expr, phase); expr = nft_expr_next(expr); } } static int nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) { /* You cannot delete the same rule twice */ if (nft_is_active_next(ctx->net, rule)) { nft_deactivate_next(ctx->net, rule); nft_use_dec(&ctx->chain->use); return 0; } return -ENOENT; } static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, struct nft_rule *rule) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); if (trans == NULL) return NULL; if (msg_type == NFT_MSG_NEWRULE && ctx->nla[NFTA_RULE_ID] != NULL) { nft_trans_rule_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); } nft_trans_rule(trans) = rule; nft_trans_rule_chain(trans) = ctx->chain; nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_flow_rule *flow; struct nft_trans *trans; int err; trans = nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule); if (trans == NULL) return -ENOMEM; if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) { flow = nft_flow_rule_create(ctx->net, rule); if (IS_ERR(flow)) { nft_trans_destroy(trans); return PTR_ERR(flow); } nft_trans_flow_rule(trans) = flow; } err = nf_tables_delrule_deactivate(ctx, rule); if (err < 0) { nft_trans_destroy(trans); return err; } nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_PREPARE); return 0; } static int nft_delrule_by_chain(struct nft_ctx *ctx) { struct nft_rule *rule; int err; list_for_each_entry(rule, &ctx->chain->rules, list) { if (!nft_is_active_next(ctx->net, rule)) continue; err = nft_delrule(ctx, rule); if (err < 0) return err; } return 0; } static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, struct nft_set *set, const struct nft_set_desc *desc) { struct nft_trans_set *trans_set; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); if (trans == NULL) return -ENOMEM; trans_set = nft_trans_container_set(trans); INIT_LIST_HEAD(&trans_set->nft_trans_binding.binding_list); INIT_LIST_HEAD(&trans_set->list_trans_newset); if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; if (desc) { nft_trans_set_update(trans) = true; nft_trans_set_gc_int(trans) = desc->gc_int; nft_trans_set_timeout(trans) = desc->timeout; nft_trans_set_size(trans) = desc->size; } nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { return __nft_trans_set_add(ctx, msg_type, set, NULL); } static int nft_mapelem_deactivate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (!nft_set_elem_active(ext, iter->genmask)) return 0; nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, elem_priv); return 0; } struct nft_set_elem_catchall { struct list_head list; struct rcu_head rcu; struct nft_elem_priv *elem; }; static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, catchall->elem); break; } } static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), .type = NFT_ITER_UPDATE, .fn = nft_mapelem_deactivate, }; set->ops->walk(ctx, set, &iter); WARN_ON_ONCE(iter.err); nft_map_catchall_deactivate(ctx, set); } static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; err = nft_trans_set_add(ctx, NFT_MSG_DELSET, set); if (err < 0) return err; if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(ctx, set); nft_deactivate_next(ctx->net, set); nft_use_dec(&ctx->table->use); return err; } static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type, struct nft_object *obj) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj)); if (trans == NULL) return -ENOMEM; if (msg_type == NFT_MSG_NEWOBJ) nft_activate_next(ctx->net, obj); nft_trans_obj(trans) = obj; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) { int err; err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj); if (err < 0) return err; nft_deactivate_next(ctx->net, obj); nft_use_dec(&ctx->table->use); return err; } static struct nft_trans * nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, struct nft_flowtable *flowtable) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_flowtable)); if (trans == NULL) return ERR_PTR(-ENOMEM); if (msg_type == NFT_MSG_NEWFLOWTABLE) nft_activate_next(ctx->net, flowtable); INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); nft_trans_flowtable(trans) = flowtable; nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } static int nft_delflowtable(struct nft_ctx *ctx, struct nft_flowtable *flowtable) { struct nft_trans *trans; trans = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); if (IS_ERR(trans)) return PTR_ERR(trans); nft_deactivate_next(ctx->net, flowtable); nft_use_dec(&ctx->table->use); return 0; } static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg) { int i; for (i = track->regs[dreg].num_reg; i > 0; i--) __nft_reg_track_cancel(track, dreg - i); } static void __nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 num_reg) { track->regs[dreg].selector = expr; track->regs[dreg].bitwise = NULL; track->regs[dreg].num_reg = num_reg; } void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len) { unsigned int regcount; int i; __nft_reg_track_clobber(track, dreg); regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE); for (i = 0; i < regcount; i++, dreg++) __nft_reg_track_update(track, expr, dreg, i); } EXPORT_SYMBOL_GPL(nft_reg_track_update); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len) { unsigned int regcount; int i; __nft_reg_track_clobber(track, dreg); regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE); for (i = 0; i < regcount; i++, dreg++) __nft_reg_track_cancel(track, dreg); } EXPORT_SYMBOL_GPL(nft_reg_track_cancel); void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg) { track->regs[dreg].selector = NULL; track->regs[dreg].bitwise = NULL; track->regs[dreg].num_reg = 0; } EXPORT_SYMBOL_GPL(__nft_reg_track_cancel); /* * Tables */ static struct nft_table *nft_table_lookup(const struct net *net, const struct nlattr *nla, u8 family, u8 genmask, u32 nlpid) { struct nftables_pernet *nft_net; struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); nft_net = nft_pernet(net); list_for_each_entry_rcu(table, &nft_net->tables, list, lockdep_is_held(&nft_net->commit_mutex)) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) { if (nft_table_has_owner(table) && nlpid && table->nlpid != nlpid) return ERR_PTR(-EPERM); return table; } } return ERR_PTR(-ENOENT); } static struct nft_table *nft_table_lookup_byhandle(const struct net *net, const struct nlattr *nla, int family, u8 genmask, u32 nlpid) { struct nftables_pernet *nft_net; struct nft_table *table; nft_net = nft_pernet(net); list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && table->family == family && nft_active_genmask(table, genmask)) { if (nft_table_has_owner(table) && nlpid && table->nlpid != nlpid) return ERR_PTR(-EPERM); return table; } } return ERR_PTR(-ENOENT); } static inline u64 nf_tables_alloc_handle(struct nft_table *table) { return ++table->hgenerator; } static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; static const struct nft_chain_type * __nft_chain_type_get(u8 family, enum nft_chain_types type) { if (family >= NFPROTO_NUMPROTO || type >= NFT_CHAIN_T_MAX) return NULL; return chain_type[family][type]; } static const struct nft_chain_type * __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { const struct nft_chain_type *type; int i; for (i = 0; i < NFT_CHAIN_T_MAX; i++) { type = __nft_chain_type_get(family, i); if (!type) continue; if (!nla_strcmp(nla, type->name)) return type; } return NULL; } struct nft_module_request { struct list_head list; char module[MODULE_NAME_LEN]; bool done; }; #ifdef CONFIG_MODULES __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...) { char module_name[MODULE_NAME_LEN]; struct nftables_pernet *nft_net; struct nft_module_request *req; va_list args; int ret; va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); if (ret >= MODULE_NAME_LEN) return 0; nft_net = nft_pernet(net); list_for_each_entry(req, &nft_net->module_list, list) { if (!strcmp(req->module, module_name)) { if (req->done) return 0; /* A request to load this module already exists. */ return -EAGAIN; } } req = kmalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; req->done = false; strscpy(req->module, module_name, MODULE_NAME_LEN); list_add_tail(&req->list, &nft_net->module_list); return -EAGAIN; } EXPORT_SYMBOL_GPL(nft_request_module); #endif static void lockdep_nfnl_nft_mutex_not_held(void) { #ifdef CONFIG_PROVE_LOCKING if (debug_locks) WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); #endif } static const struct nft_chain_type * nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, u8 family, bool autoload) { const struct nft_chain_type *type; type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return type; lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (autoload) { if (nft_request_module(net, "nft-chain-%u-%.*s", family, nla_len(nla), (const char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } static __be16 nft_base_seq(const struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); return htons(nft_net->base_seq & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, [NFTA_TABLE_HANDLE] = { .type = NLA_U64 }, [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN } }; static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table) { struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) || nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle), NFTA_TABLE_PAD)) goto nla_put_failure; if (event == NFT_MSG_DELTABLE) { nlmsg_end(skb, nlh); return 0; } if (nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags & NFT_TABLE_F_MASK))) goto nla_put_failure; if (nft_table_has_owner(table) && nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid))) goto nla_put_failure; if (table->udata) { if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata)) goto nla_put_failure; } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } struct nftnl_skb_parms { bool report; }; #define NFT_CB(skb) (*(struct nftnl_skb_parms*)&((skb)->cb)) static void nft_notify_enqueue(struct sk_buff *skb, bool report, struct list_head *notify_list) { NFT_CB(skb).report = report; list_add_tail(&skb->list, notify_list); } static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct nftables_pernet *nft_net; struct sk_buff *skb; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; } nft_net = nft_pernet(ctx->net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_tables(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nftables_pernet *nft_net; const struct nft_table *table; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; if (idx < s_idx) goto cont; if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (!nft_is_active(net, table)) continue; if (nf_tables_fill_table_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, NLM_F_MULTI, table->family, table) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } done: rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *c) { int err; if (!try_module_get(THIS_MODULE)) return -EINVAL; rcu_read_unlock(); err = netlink_dump_start(nlsk, skb, nlh, c); rcu_read_lock(); module_put(THIS_MODULE); return err; } /* called with rcu_read_lock held */ static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; struct net *net = info->net; struct sk_buff *skb2; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_tables, .module = THIS_MODULE, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]); return PTR_ERR(table); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) goto err_fill_table_info; return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_table_info: kfree_skb(skb2); return err; } static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) { struct nft_chain *chain; u32 i = 0; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; if (!nft_is_base_chain(chain)) continue; if (cnt && i++ == cnt) break; nf_tables_unregister_hook(net, table, chain); } } static int nf_tables_table_enable(struct net *net, struct nft_table *table) { struct nft_chain *chain; int err, i = 0; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; if (!nft_is_base_chain(chain)) continue; err = nf_tables_register_hook(net, table, chain); if (err < 0) goto err_register_hooks; i++; } return 0; err_register_hooks: if (i) nft_table_disable(net, table, i); return err; } static void nf_tables_table_disable(struct net *net, struct nft_table *table) { table->flags &= ~NFT_TABLE_F_DORMANT; nft_table_disable(net, table, 0); table->flags |= NFT_TABLE_F_DORMANT; } #define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1) #define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0) #define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1) #define __NFT_TABLE_F_WAS_ORPHAN (__NFT_TABLE_F_INTERNAL << 2) #define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \ __NFT_TABLE_F_WAS_AWAKEN | \ __NFT_TABLE_F_WAS_ORPHAN) static bool nft_table_pending_update(const struct nft_ctx *ctx) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct nft_trans *trans; if (ctx->table->flags & __NFT_TABLE_F_UPDATE) return true; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->table == ctx->table && ((trans->msg_type == NFT_MSG_NEWCHAIN && nft_trans_chain_update(trans)) || (trans->msg_type == NFT_MSG_DELCHAIN && nft_is_base_chain(nft_trans_chain(trans))))) return true; } return false; } static int nf_tables_updtable(struct nft_ctx *ctx) { struct nft_trans *trans; u32 flags; int ret; if (!ctx->nla[NFTA_TABLE_FLAGS]) return 0; flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); if (flags & ~NFT_TABLE_F_MASK) return -EOPNOTSUPP; if (flags == (ctx->table->flags & NFT_TABLE_F_MASK)) return 0; if ((nft_table_has_owner(ctx->table) && !(flags & NFT_TABLE_F_OWNER)) || (flags & NFT_TABLE_F_OWNER && !nft_table_is_orphan(ctx->table))) return -EOPNOTSUPP; if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST) return -EOPNOTSUPP; /* No dormant off/on/off/on games in single transaction */ if (nft_table_pending_update(ctx)) return -EINVAL; trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) return -ENOMEM; if ((flags & NFT_TABLE_F_DORMANT) && !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { ctx->table->flags |= NFT_TABLE_F_DORMANT; if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { ctx->table->flags &= ~NFT_TABLE_F_DORMANT; if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) { ret = nf_tables_table_enable(ctx->net, ctx->table); if (ret < 0) goto err_register_hooks; ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT; } } if ((flags & NFT_TABLE_F_OWNER) && !nft_table_has_owner(ctx->table)) { ctx->table->nlpid = ctx->portid; ctx->table->flags |= NFT_TABLE_F_OWNER | __NFT_TABLE_F_WAS_ORPHAN; } nft_trans_table_update(trans) = true; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_register_hooks: ctx->table->flags |= NFT_TABLE_F_DORMANT; nft_trans_destroy(trans); return ret; } static u32 nft_chain_hash(const void *data, u32 len, u32 seed) { const char *name = data; return jhash(name, strlen(name), seed); } static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed) { const struct nft_chain *chain = data; return nft_chain_hash(chain->name, 0, seed); } static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct nft_chain *chain = ptr; const char *name = arg->key; return strcmp(chain->name, name); } static u32 nft_objname_hash(const void *data, u32 len, u32 seed) { const struct nft_object_hash_key *k = data; seed ^= hash_ptr(k->table, 32); return jhash(k->name, strlen(k->name), seed); } static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed) { const struct nft_object *obj = data; return nft_objname_hash(&obj->key, 0, seed); } static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct nft_object_hash_key *k = arg->key; const struct nft_object *obj = ptr; if (obj->key.table != k->table) return -1; return strcmp(obj->key.name, k->name); } static bool nft_supported_family(u8 family) { return false #ifdef CONFIG_NF_TABLES_INET || family == NFPROTO_INET #endif #ifdef CONFIG_NF_TABLES_IPV4 || family == NFPROTO_IPV4 #endif #ifdef CONFIG_NF_TABLES_ARP || family == NFPROTO_ARP #endif #ifdef CONFIG_NF_TABLES_NETDEV || family == NFPROTO_NETDEV #endif #if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE) || family == NFPROTO_BRIDGE #endif #ifdef CONFIG_NF_TABLES_IPV6 || family == NFPROTO_IPV6 #endif ; } static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; u32 flags = 0; int err; if (!nft_supported_family(family)) return -EOPNOTSUPP; lockdep_assert_held(&nft_net->commit_mutex); attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); } else { if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nf_tables_updtable(&ctx); } if (nla[NFTA_TABLE_FLAGS]) { flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); if (flags & ~NFT_TABLE_F_MASK) return -EOPNOTSUPP; } err = -ENOMEM; table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT); if (table == NULL) goto err_kzalloc; table->validate_state = nft_net->validate_state; table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (table->name == NULL) goto err_strdup; if (nla[NFTA_TABLE_USERDATA]) { table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT); if (table->udata == NULL) goto err_table_udata; table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]); } err = rhltable_init(&table->chains_ht, &nft_chain_ht_params); if (err) goto err_chain_ht; INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); INIT_LIST_HEAD(&table->flowtables); table->family = family; table->flags = flags; table->handle = ++nft_net->table_handle; if (table->flags & NFT_TABLE_F_OWNER) table->nlpid = NETLINK_CB(skb).portid; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) goto err_trans; list_add_tail_rcu(&table->list, &nft_net->tables); return 0; err_trans: rhltable_destroy(&table->chains_ht); err_chain_ht: kfree(table->udata); err_table_udata: kfree(table->name); err_strdup: kfree(table); err_kzalloc: return err; } static int nft_flush_table(struct nft_ctx *ctx) { struct nft_flowtable *flowtable, *nft; struct nft_chain *chain, *nc; struct nft_object *obj, *ne; struct nft_set *set, *ns; int err; list_for_each_entry(chain, &ctx->table->chains, list) { if (!nft_is_active_next(ctx->net, chain)) continue; if (nft_chain_binding(chain)) continue; ctx->chain = chain; err = nft_delrule_by_chain(ctx); if (err < 0) goto out; } list_for_each_entry_safe(set, ns, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, set)) continue; if (nft_set_is_anonymous(set)) continue; err = nft_delset(ctx, set); if (err < 0) goto out; } list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) { if (!nft_is_active_next(ctx->net, flowtable)) continue; err = nft_delflowtable(ctx, flowtable); if (err < 0) goto out; } list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) { if (!nft_is_active_next(ctx->net, obj)) continue; err = nft_delobj(ctx, obj); if (err < 0) goto out; } list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) { if (!nft_is_active_next(ctx->net, chain)) continue; if (nft_chain_binding(chain)) continue; ctx->chain = chain; err = nft_delchain(ctx); if (err < 0) goto out; } err = nft_deltable(ctx); out: return err; } static int nft_flush(struct nft_ctx *ctx, int family) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nlattr * const *nla = ctx->nla; struct nft_table *table, *nt; int err = 0; list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (family != AF_UNSPEC && table->family != family) continue; ctx->family = table->family; if (!nft_is_active_next(ctx->net, table)) continue; if (nft_table_has_owner(table) && table->nlpid != ctx->portid) continue; if (nla[NFTA_TABLE_NAME] && nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) continue; ctx->table = table; err = nft_flush_table(ctx); if (err < 0) goto out; } out: return err; } static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla); if (family == AF_UNSPEC || (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) return nft_flush(&ctx, family); if (nla[NFTA_TABLE_HANDLE]) { attr = nla[NFTA_TABLE_HANDLE]; table = nft_table_lookup_byhandle(net, attr, family, genmask, NETLINK_CB(skb).portid); } else { attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask, NETLINK_CB(skb).portid); } if (IS_ERR(table)) { if (PTR_ERR(table) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE) return 0; NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(table); } if (info->nlh->nlmsg_flags & NLM_F_NONREC && table->use > 0) return -EBUSY; ctx.family = family; ctx.table = table; return nft_flush_table(&ctx); } static void nf_tables_table_destroy(struct nft_table *table) { if (WARN_ON(table->use > 0)) return; rhltable_destroy(&table->chains_ht); kfree(table->name); kfree(table->udata); kfree(table); } void nft_register_chain_type(const struct nft_chain_type *ctype) { nfnl_lock(NFNL_SUBSYS_NFTABLES); if (WARN_ON(__nft_chain_type_get(ctype->family, ctype->type))) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); return; } chain_type[ctype->family][ctype->type] = ctype; nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_register_chain_type); void nft_unregister_chain_type(const struct nft_chain_type *ctype) { nfnl_lock(NFNL_SUBSYS_NFTABLES); chain_type[ctype->family][ctype->type] = NULL; nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_chain_type); /* * Chains */ static struct nft_chain * nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { if (chain->handle == handle && nft_active_genmask(chain, genmask)) return chain; } return ERR_PTR(-ENOENT); } static bool lockdep_commit_lock_is_held(const struct net *net) { #ifdef CONFIG_PROVE_LOCKING struct nftables_pernet *nft_net = nft_pernet(net); return lockdep_is_held(&nft_net->commit_mutex); #else return true; #endif } static struct nft_chain *nft_chain_lookup(struct net *net, struct nft_table *table, const struct nlattr *nla, u8 genmask) { char search[NFT_CHAIN_MAXNAMELEN + 1]; struct rhlist_head *tmp, *list; struct nft_chain *chain; if (nla == NULL) return ERR_PTR(-EINVAL); nla_strscpy(search, nla, sizeof(search)); WARN_ON(!rcu_read_lock_held() && !lockdep_commit_lock_is_held(net)); chain = ERR_PTR(-ENOENT); rcu_read_lock(); list = rhltable_lookup(&table->chains_ht, search, nft_chain_ht_params); if (!list) goto out_unlock; rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) { if (nft_active_genmask(chain, genmask)) goto out_unlock; } chain = ERR_PTR(-ENOENT); out_unlock: rcu_read_unlock(); return chain; } static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { [NFTA_CHAIN_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 }, [NFTA_CHAIN_NAME] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, [NFTA_CHAIN_TYPE] = { .type = NLA_STRING, .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, [NFTA_CHAIN_ID] = { .type = NLA_U32 }, [NFTA_CHAIN_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 }, [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 }, [NFTA_HOOK_DEV] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) { struct nft_stats *cpu_stats, total; struct nlattr *nest; unsigned int seq; u64 pkts, bytes; int cpu; if (!stats) return 0; memset(&total, 0, sizeof(total)); for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); do { seq = u64_stats_fetch_begin(&cpu_stats->syncp); pkts = cpu_stats->pkts; bytes = cpu_stats->bytes; } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq)); total.pkts += pkts; total.bytes += bytes; } nest = nla_nest_start_noflag(skb, NFTA_CHAIN_COUNTERS); if (nest == NULL) goto nla_put_failure; if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts), NFTA_COUNTER_PAD) || nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), NFTA_COUNTER_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: return -ENOSPC; } static int nft_dump_basechain_hook(struct sk_buff *skb, const struct net *net, int family, const struct nft_base_chain *basechain, const struct list_head *hook_list) { const struct nf_hook_ops *ops = &basechain->ops; struct nft_hook *hook, *first = NULL; struct nlattr *nest, *nest_devs; int n = 0; nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); if (nest == NULL) goto nla_put_failure; if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) goto nla_put_failure; if (nft_base_chain_netdev(family, ops->hooknum)) { nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS); if (!nest_devs) goto nla_put_failure; if (!hook_list) hook_list = &basechain->hook_list; list_for_each_entry_rcu(hook, hook_list, list, lockdep_commit_lock_is_held(net)) { if (!first) first = hook; if (nla_put(skb, NFTA_DEVICE_NAME, hook->ifnamelen, hook->ifname)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && nla_put(skb, NFTA_HOOK_DEV, first->ifnamelen, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); return 0; nla_put_failure: return -1; } static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, const struct list_head *hook_list) { struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name) || nla_put_string(skb, NFTA_CHAIN_NAME, chain->name) || nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), NFTA_CHAIN_PAD)) goto nla_put_failure; if (event == NFT_MSG_DELCHAIN && !hook_list) { nlmsg_end(skb, nlh); return 0; } if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); struct nft_stats __percpu *stats; if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_POLICY, htonl(basechain->policy))) goto nla_put_failure; if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) goto nla_put_failure; stats = rcu_dereference_check(basechain->stats, lockdep_commit_lock_is_held(net)); if (nft_dump_stats(skb, stats)) goto nla_put_failure; } if (chain->flags && nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) goto nla_put_failure; if (chain->udata && nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event, const struct list_head *hook_list) { struct nftables_pernet *nft_net; struct sk_buff *skb; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, ctx->chain, hook_list); if (err < 0) { kfree_skb(skb); goto err; } nft_net = nft_pernet(ctx->net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_chains(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; const struct nft_table *table; const struct nft_chain *chain; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; list_for_each_entry_rcu(chain, &table->chains, list) { if (idx < s_idx) goto cont; if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (!nft_is_active(net, chain)) continue; if (nf_tables_fill_chain_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, NLM_F_MULTI, table->family, table, chain, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } done: rcu_read_unlock(); cb->args[0] = idx; return skb->len; } /* called with rcu_read_lock held */ static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_chains, .module = THIS_MODULE, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); } chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return PTR_ERR(chain); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain, NULL); if (err < 0) goto err_fill_chain_info; return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_chain_info: kfree_skb(skb2); return err; } static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) { struct nlattr *tb[NFTA_COUNTER_MAX+1]; struct nft_stats __percpu *newstats; struct nft_stats *stats; int err; err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy, NULL); if (err < 0) return ERR_PTR_PCPU(err); if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) return ERR_PTR_PCPU(-EINVAL); newstats = netdev_alloc_pcpu_stats(struct nft_stats); if (newstats == NULL) return ERR_PTR_PCPU(-ENOMEM); /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. */ preempt_disable(); stats = this_cpu_ptr(newstats); stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); preempt_enable(); return newstats; } static void nft_chain_stats_replace(struct nft_trans_chain *trans) { const struct nft_trans *t = &trans->nft_trans_binding.nft_trans; struct nft_base_chain *chain = nft_base_chain(trans->chain); if (!trans->stats) return; trans->stats = rcu_replace_pointer(chain->stats, trans->stats, lockdep_commit_lock_is_held(t->net)); if (!trans->stats) static_branch_inc(&nft_counters_enabled); } static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) { struct nft_rule_blob *g0 = rcu_dereference_raw(chain->blob_gen_0); struct nft_rule_blob *g1 = rcu_dereference_raw(chain->blob_gen_1); if (g0 != g1) kvfree(g1); kvfree(g0); /* should be NULL either via abort or via successful commit */ WARN_ON_ONCE(chain->blob_next); kvfree(chain->blob_next); } void nf_tables_chain_destroy(struct nft_chain *chain) { const struct nft_table *table = chain->table; struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) return; /* no concurrent access possible anymore */ nf_tables_chain_free_chain_rules(chain); if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { list_for_each_entry_safe(hook, next, &basechain->hook_list, list) { list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } } module_put(basechain->type->owner); if (rcu_access_pointer(basechain->stats)) { static_branch_dec(&nft_counters_enabled); free_percpu(rcu_dereference_raw(basechain->stats)); } kfree(chain->name); kfree(chain->udata); kfree(basechain); } else { kfree(chain->name); kfree(chain->udata); kfree(chain); } } static struct nft_hook *nft_netdev_hook_alloc(struct net *net, const struct nlattr *attr) { struct net_device *dev; struct nft_hook *hook; int err; hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); if (!hook) { err = -ENOMEM; goto err_hook_alloc; } err = nla_strscpy(hook->ifname, attr, IFNAMSIZ); if (err < 0) goto err_hook_dev; hook->ifnamelen = nla_len(attr); /* nf_tables_netdev_event() is called under rtnl_mutex, this is * indirectly serializing all the other holders of the commit_mutex with * the rtnl_mutex. */ dev = __dev_get_by_name(net, hook->ifname); if (!dev) { err = -ENOENT; goto err_hook_dev; } hook->ops.dev = dev; return hook; err_hook_dev: kfree(hook); err_hook_alloc: return ERR_PTR(err); } static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, const struct nft_hook *this) { struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { if (!strcmp(hook->ifname, this->ifname)) return hook; } return NULL; } static int nf_tables_parse_netdev_hooks(struct net *net, const struct nlattr *attr, struct list_head *hook_list, struct netlink_ext_ack *extack) { struct nft_hook *hook, *next; const struct nlattr *tmp; int rem, n = 0, err; nla_for_each_nested(tmp, attr, rem) { if (nla_type(tmp) != NFTA_DEVICE_NAME) { err = -EINVAL; goto err_hook; } hook = nft_netdev_hook_alloc(net, tmp); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tmp); err = PTR_ERR(hook); goto err_hook; } if (nft_hook_list_find(hook_list, hook)) { NL_SET_BAD_ATTR(extack, tmp); kfree(hook); err = -EEXIST; goto err_hook; } list_add_tail(&hook->list, hook_list); n++; if (n == NFT_NETDEVICE_MAX) { err = -EFBIG; goto err_hook; } } return 0; err_hook: list_for_each_entry_safe(hook, next, hook_list, list) { list_del(&hook->list); kfree(hook); } return err; } struct nft_chain_hook { u32 num; s32 priority; const struct nft_chain_type *type; struct list_head list; }; static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[], struct list_head *hook_list, struct netlink_ext_ack *extack, u32 flags) { struct nft_hook *hook; int err; if (tb[NFTA_HOOK_DEV]) { hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]); return PTR_ERR(hook); } list_add_tail(&hook->list, hook_list); } else if (tb[NFTA_HOOK_DEVS]) { err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS], hook_list, extack); if (err < 0) return err; } if (flags & NFT_CHAIN_HW_OFFLOAD && list_empty(hook_list)) return -EINVAL; return 0; } static int nft_chain_parse_hook(struct net *net, struct nft_base_chain *basechain, const struct nlattr * const nla[], struct nft_chain_hook *hook, u8 family, u32 flags, struct netlink_ext_ack *extack) { struct nftables_pernet *nft_net = nft_pernet(net); struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; int err; lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], nft_hook_policy, NULL); if (err < 0) return err; if (!basechain) { if (!ha[NFTA_HOOK_HOOKNUM] || !ha[NFTA_HOOK_PRIORITY]) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return -ENOENT; } hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); if (!type) return -EOPNOTSUPP; if (nla[NFTA_CHAIN_TYPE]) { type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], family, true); if (IS_ERR(type)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); return PTR_ERR(type); } } if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) return -EOPNOTSUPP; if (type->type == NFT_CHAIN_T_NAT && hook->priority <= NF_IP_PRI_CONNTRACK) return -EOPNOTSUPP; } else { if (ha[NFTA_HOOK_HOOKNUM]) { hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); if (hook->num != basechain->ops.hooknum) return -EOPNOTSUPP; } if (ha[NFTA_HOOK_PRIORITY]) { hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); if (hook->priority != basechain->ops.priority) return -EOPNOTSUPP; } if (nla[NFTA_CHAIN_TYPE]) { type = __nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE], family); if (!type) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); return -ENOENT; } } else { type = basechain->type; } } if (!try_module_get(type->owner)) { if (nla[NFTA_CHAIN_TYPE]) NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); return -ENOENT; } hook->type = type; INIT_LIST_HEAD(&hook->list); if (nft_base_chain_netdev(family, hook->num)) { err = nft_chain_parse_netdev(net, ha, &hook->list, extack, flags); if (err < 0) { module_put(type->owner); return err; } } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) { module_put(type->owner); return -EOPNOTSUPP; } return 0; } static void nft_chain_release_hook(struct nft_chain_hook *hook) { struct nft_hook *h, *next; list_for_each_entry_safe(h, next, &hook->list, list) { list_del(&h->list); kfree(h); } module_put(hook->type->owner); } static void nft_last_rule(const struct nft_chain *chain, const void *ptr) { struct nft_rule_dp_last *lrule; BUILD_BUG_ON(offsetof(struct nft_rule_dp_last, end) != 0); lrule = (struct nft_rule_dp_last *)ptr; lrule->end.is_last = 1; lrule->chain = chain; /* blob size does not include the trailer rule */ } static struct nft_rule_blob *nf_tables_chain_alloc_rules(const struct nft_chain *chain, unsigned int size) { struct nft_rule_blob *blob; if (size > INT_MAX) return NULL; size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rule_dp_last); blob = kvmalloc(size, GFP_KERNEL_ACCOUNT); if (!blob) return NULL; blob->size = 0; nft_last_rule(chain, blob->data); return blob; } static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, const struct nft_chain_hook *hook, struct nft_chain *chain) { ops->pf = family; ops->hooknum = hook->num; ops->priority = hook->priority; ops->priv = chain; ops->hook = hook->type->hooks[ops->hooknum]; ops->hook_ops_type = NF_HOOK_OP_NF_TABLES; } static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, struct nft_chain_hook *hook, u32 flags) { struct nft_chain *chain; struct nft_hook *h; basechain->type = hook->type; INIT_LIST_HEAD(&basechain->hook_list); chain = &basechain->chain; if (nft_base_chain_netdev(family, hook->num)) { list_splice_init(&hook->list, &basechain->hook_list); list_for_each_entry(h, &basechain->hook_list, list) nft_basechain_hook_init(&h->ops, family, hook, chain); } nft_basechain_hook_init(&basechain->ops, family, hook, chain); chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && !nft_chain_offload_support(basechain)) { list_splice_init(&basechain->hook_list, &hook->list); return -EOPNOTSUPP; } flow_block_init(&basechain->flow_block); return 0; } int nft_chain_add(struct nft_table *table, struct nft_chain *chain) { int err; err = rhltable_insert_key(&table->chains_ht, chain->name, &chain->rhlhead, nft_chain_ht_params); if (err) return err; list_add_tail_rcu(&chain->list, &table->chains); return 0; } static u64 chain_id; static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy, u32 flags, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; struct nft_base_chain *basechain; struct net *net = ctx->net; char name[NFT_NAME_MAXLEN]; struct nft_rule_blob *blob; struct nft_trans *trans; struct nft_chain *chain; int err; if (nla[NFTA_CHAIN_HOOK]) { struct nft_stats __percpu *stats = NULL; struct nft_chain_hook hook = {}; if (table->flags & __NFT_TABLE_F_UPDATE) return -EINVAL; if (flags & NFT_CHAIN_BINDING) return -EOPNOTSUPP; err = nft_chain_parse_hook(net, NULL, nla, &hook, family, flags, extack); if (err < 0) return err; basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT); if (basechain == NULL) { nft_chain_release_hook(&hook); return -ENOMEM; } chain = &basechain->chain; if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); if (IS_ERR_PCPU(stats)) { nft_chain_release_hook(&hook); kfree(basechain); return PTR_ERR_PCPU(stats); } rcu_assign_pointer(basechain->stats, stats); } err = nft_basechain_init(basechain, family, &hook, flags); if (err < 0) { nft_chain_release_hook(&hook); kfree(basechain); free_percpu(stats); return err; } if (stats) static_branch_inc(&nft_counters_enabled); } else { if (flags & NFT_CHAIN_BASE) return -EINVAL; if (flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT); if (chain == NULL) return -ENOMEM; chain->flags = flags; } ctx->chain = chain; INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; if (nla[NFTA_CHAIN_NAME]) { chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT); } else { if (!(flags & NFT_CHAIN_BINDING)) { err = -EINVAL; goto err_destroy_chain; } snprintf(name, sizeof(name), "__chain%llu", ++chain_id); chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT); } if (!chain->name) { err = -ENOMEM; goto err_destroy_chain; } if (nla[NFTA_CHAIN_USERDATA]) { chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT); if (chain->udata == NULL) { err = -ENOMEM; goto err_destroy_chain; } chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]); } blob = nf_tables_chain_alloc_rules(chain, 0); if (!blob) { err = -ENOMEM; goto err_destroy_chain; } RCU_INIT_POINTER(chain->blob_gen_0, blob); RCU_INIT_POINTER(chain->blob_gen_1, blob); if (!nft_use_inc(&table->use)) { err = -EMFILE; goto err_destroy_chain; } trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto err_trans; } nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; err = nft_chain_add(table, chain); if (err < 0) goto err_chain_add; /* This must be LAST to ensure no packets are walking over this chain. */ err = nf_tables_register_hook(net, table, chain); if (err < 0) goto err_register_hook; return 0; err_register_hook: nft_chain_del(chain); err_chain_add: nft_trans_destroy(trans); err_trans: nft_use_dec_restore(&table->use); err_destroy_chain: nf_tables_chain_destroy(chain); return err; } static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, u32 flags, const struct nlattr *attr, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_base_chain *basechain = NULL; struct nft_table *table = ctx->table; struct nft_chain *chain = ctx->chain; struct nft_chain_hook hook = {}; struct nft_stats __percpu *stats = NULL; struct nft_hook *h, *next; struct nf_hook_ops *ops; struct nft_trans *trans; bool unregister = false; int err; if (chain->flags ^ flags) return -EOPNOTSUPP; INIT_LIST_HEAD(&hook.list); if (nla[NFTA_CHAIN_HOOK]) { if (!nft_is_base_chain(chain)) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } basechain = nft_base_chain(chain); err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook, ctx->family, flags, extack); if (err < 0) return err; if (basechain->type != hook.type) { nft_chain_release_hook(&hook); NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { list_for_each_entry_safe(h, next, &hook.list, list) { h->ops.pf = basechain->ops.pf; h->ops.hooknum = basechain->ops.hooknum; h->ops.priority = basechain->ops.priority; h->ops.priv = basechain->ops.priv; h->ops.hook = basechain->ops.hook; if (nft_hook_list_find(&basechain->hook_list, h)) { list_del(&h->list); kfree(h); } } } else { ops = &basechain->ops; if (ops->hooknum != hook.num || ops->priority != hook.priority) { nft_chain_release_hook(&hook); NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } } } if (nla[NFTA_CHAIN_HANDLE] && nla[NFTA_CHAIN_NAME]) { struct nft_chain *chain2; chain2 = nft_chain_lookup(ctx->net, table, nla[NFTA_CHAIN_NAME], genmask); if (!IS_ERR(chain2)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); err = -EEXIST; goto err_hooks; } } if (table->flags & __NFT_TABLE_F_UPDATE && !list_empty(&hook.list)) { NL_SET_BAD_ATTR(extack, attr); err = -EOPNOTSUPP; goto err_hooks; } if (!(table->flags & NFT_TABLE_F_DORMANT) && nft_is_base_chain(chain) && !list_empty(&hook.list)) { basechain = nft_base_chain(chain); ops = &basechain->ops; if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { err = nft_netdev_register_hooks(ctx->net, &hook.list); if (err < 0) goto err_hooks; } } unregister = true; if (nla[NFTA_CHAIN_COUNTERS]) { if (!nft_is_base_chain(chain)) { err = -EOPNOTSUPP; goto err_hooks; } stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); if (IS_ERR_PCPU(stats)) { err = PTR_ERR_PCPU(stats); goto err_hooks; } } err = -ENOMEM; trans = nft_trans_alloc_chain(ctx, NFT_MSG_NEWCHAIN); if (trans == NULL) goto err_trans; nft_trans_chain_stats(trans) = stats; nft_trans_chain_update(trans) = true; if (nla[NFTA_CHAIN_POLICY]) nft_trans_chain_policy(trans) = policy; else nft_trans_chain_policy(trans) = -1; if (nla[NFTA_CHAIN_HANDLE] && nla[NFTA_CHAIN_NAME]) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct nft_trans *tmp; char *name; err = -ENOMEM; name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT); if (!name) goto err_trans; err = -EEXIST; list_for_each_entry(tmp, &nft_net->commit_list, list) { if (tmp->msg_type == NFT_MSG_NEWCHAIN && tmp->table == table && nft_trans_chain_update(tmp) && nft_trans_chain_name(tmp) && strcmp(name, nft_trans_chain_name(tmp)) == 0) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); kfree(name); goto err_trans; } } nft_trans_chain_name(trans) = name; } nft_trans_basechain(trans) = basechain; INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); list_splice(&hook.list, &nft_trans_chain_hooks(trans)); if (nla[NFTA_CHAIN_HOOK]) module_put(hook.type->owner); nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_trans: free_percpu(stats); kfree(trans); err_hooks: if (nla[NFTA_CHAIN_HOOK]) { list_for_each_entry_safe(h, next, &hook.list, list) { if (unregister) nf_unregister_net_hook(ctx->net, &h->ops); list_del(&h->list); kfree_rcu(h, rcu); } module_put(hook.type->owner); } return err; } static struct nft_chain *nft_chain_lookup_byid(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWCHAIN && nft_trans_chain(trans)->table == table && id == nft_trans_chain_id(trans) && nft_active_genmask(nft_trans_chain(trans), genmask)) return nft_trans_chain(trans); } return ERR_PTR(-ENOENT); } static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; u32 flags = 0; lockdep_assert_held(&nft_net->commit_mutex); table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); } chain = NULL; attr = nla[NFTA_CHAIN_NAME]; if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); chain = nft_chain_lookup_byhandle(table, handle, genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]); return PTR_ERR(chain); } attr = nla[NFTA_CHAIN_HANDLE]; } else if (nla[NFTA_CHAIN_NAME]) { chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) { NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); } chain = NULL; } } else if (!nla[NFTA_CHAIN_ID]) { return -EINVAL; } if (nla[NFTA_CHAIN_POLICY]) { if (chain != NULL && !nft_is_base_chain(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]); return -EOPNOTSUPP; } if (chain == NULL && nla[NFTA_CHAIN_HOOK] == NULL) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]); return -EOPNOTSUPP; } policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY])); switch (policy) { case NF_DROP: case NF_ACCEPT: break; default: return -EINVAL; } } if (nla[NFTA_CHAIN_FLAGS]) flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS])); else if (chain) flags = chain->flags; if (flags & ~NFT_CHAIN_FLAGS) return -EOPNOTSUPP; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain != NULL) { if (chain->flags & NFT_CHAIN_BINDING) return -EINVAL; if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; flags |= chain->flags & NFT_CHAIN_BASE; return nf_tables_updchain(&ctx, genmask, policy, flags, attr, extack); } return nf_tables_addchain(&ctx, family, policy, flags, extack); } static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { const struct nft_chain *chain = &basechain->chain; const struct nlattr * const *nla = ctx->nla; struct nft_chain_hook chain_hook = {}; struct nft_hook *this, *hook; LIST_HEAD(chain_del_list); struct nft_trans *trans; int err; if (ctx->table->flags & __NFT_TABLE_F_UPDATE) return -EOPNOTSUPP; err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, ctx->family, chain->flags, extack); if (err < 0) return err; list_for_each_entry(this, &chain_hook.list, list) { hook = nft_hook_list_find(&basechain->hook_list, this); if (!hook) { err = -ENOENT; goto err_chain_del_hook; } list_move(&hook->list, &chain_del_list); } trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN); if (!trans) { err = -ENOMEM; goto err_chain_del_hook; } nft_trans_basechain(trans) = basechain; nft_trans_chain_update(trans) = true; INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); list_splice(&chain_del_list, &nft_trans_chain_hooks(trans)); nft_chain_release_hook(&chain_hook); nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_chain_del_hook: list_splice(&chain_del_list, &basechain->hook_list); nft_chain_release_hook(&chain_hook); return err; } static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule; struct nft_ctx ctx; u64 handle; u32 use; int err; table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); } if (nla[NFTA_CHAIN_HANDLE]) { attr = nla[NFTA_CHAIN_HANDLE]; handle = be64_to_cpu(nla_get_be64(attr)); chain = nft_chain_lookup_byhandle(table, handle, genmask); } else { attr = nla[NFTA_CHAIN_NAME]; chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { if (PTR_ERR(chain) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN) return 0; NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); } if (nft_chain_binding(chain)) return -EOPNOTSUPP; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (nla[NFTA_CHAIN_HOOK]) { if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN || chain->flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) return nft_delchain_hook(&ctx, basechain, extack); } } if (info->nlh->nlmsg_flags & NLM_F_NONREC && chain->use > 0) return -EBUSY; use = chain->use; list_for_each_entry(rule, &chain->rules, list) { if (!nft_is_active_next(net, rule)) continue; use--; err = nft_delrule(&ctx, rule); if (err < 0) return err; } /* There are rules and elements that are still holding references to us, * we cannot do a recursive removal in this case. */ if (use > 0) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } return nft_delchain(&ctx); } /* * Expressions */ /** * nft_register_expr - register nf_tables expr type * @type: expr type * * Registers the expr type for use with nf_tables. Returns zero on * success or a negative errno code otherwise. */ int nft_register_expr(struct nft_expr_type *type) { if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR)) return -ENOMEM; nfnl_lock(NFNL_SUBSYS_NFTABLES); if (type->family == NFPROTO_UNSPEC) list_add_tail_rcu(&type->list, &nf_tables_expressions); else list_add_rcu(&type->list, &nf_tables_expressions); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } EXPORT_SYMBOL_GPL(nft_register_expr); /** * nft_unregister_expr - unregister nf_tables expr type * @type: expr type * * Unregisters the expr typefor use with nf_tables. */ void nft_unregister_expr(struct nft_expr_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); list_del_rcu(&type->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_expr); static const struct nft_expr_type *__nft_expr_type_get(u8 family, struct nlattr *nla) { const struct nft_expr_type *type, *candidate = NULL; list_for_each_entry_rcu(type, &nf_tables_expressions, list) { if (!nla_strcmp(nla, type->name)) { if (!type->family && !candidate) candidate = type; else if (type->family == family) candidate = type; } } return candidate; } #ifdef CONFIG_MODULES static int nft_expr_type_request_module(struct net *net, u8 family, struct nlattr *nla) { if (nft_request_module(net, "nft-expr-%u-%.*s", family, nla_len(nla), (char *)nla_data(nla)) == -EAGAIN) return -EAGAIN; return 0; } #endif static const struct nft_expr_type *nft_expr_type_get(struct net *net, u8 family, struct nlattr *nla) { const struct nft_expr_type *type; if (nla == NULL) return ERR_PTR(-EINVAL); rcu_read_lock(); type = __nft_expr_type_get(family, nla); if (type != NULL && try_module_get(type->owner)) { rcu_read_unlock(); return type; } rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { if (nft_expr_type_request_module(net, family, nla) == -EAGAIN) return ERR_PTR(-EAGAIN); if (nft_request_module(net, "nft-expr-%.*s", nla_len(nla), (char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { [NFTA_EXPR_NAME] = { .type = NLA_STRING, .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_EXPR_DATA] = { .type = NLA_NESTED }, }; static int nf_tables_fill_expr_info(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name)) goto nla_put_failure; if (expr->ops->dump) { struct nlattr *data = nla_nest_start_noflag(skb, NFTA_EXPR_DATA); if (data == NULL) goto nla_put_failure; if (expr->ops->dump(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, data); } return skb->len; nla_put_failure: return -1; }; int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, attr); if (!nest) goto nla_put_failure; if (nf_tables_fill_expr_info(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: return -1; } struct nft_expr_info { const struct nft_expr_ops *ops; const struct nlattr *attr; struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; }; static int nf_tables_expr_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info) { const struct nft_expr_type *type; const struct nft_expr_ops *ops; struct nlattr *tb[NFTA_EXPR_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla, nft_expr_policy, NULL); if (err < 0) return err; type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]); if (IS_ERR(type)) return PTR_ERR(type); if (tb[NFTA_EXPR_DATA]) { err = nla_parse_nested_deprecated(info->tb, type->maxattr, tb[NFTA_EXPR_DATA], type->policy, NULL); if (err < 0) goto err1; } else memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1)); if (type->select_ops != NULL) { ops = type->select_ops(ctx, (const struct nlattr * const *)info->tb); if (IS_ERR(ops)) { err = PTR_ERR(ops); #ifdef CONFIG_MODULES if (err == -EAGAIN) if (nft_expr_type_request_module(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]) != -EAGAIN) err = -ENOENT; #endif goto err1; } } else ops = type->ops; info->attr = nla; info->ops = ops; return 0; err1: module_put(type->owner); return err; } int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info) { struct nlattr *tb[NFTA_EXPR_MAX + 1]; const struct nft_expr_type *type; int err; err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla, nft_expr_policy, NULL); if (err < 0) return err; if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME]) return -EINVAL; rcu_read_lock(); type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); if (!type) { err = -ENOENT; goto out_unlock; } if (!type->inner_ops) { err = -EOPNOTSUPP; goto out_unlock; } err = nla_parse_nested_deprecated(info->tb, type->maxattr, tb[NFTA_EXPR_DATA], type->policy, NULL); if (err < 0) goto out_unlock; info->attr = nla; info->ops = type->inner_ops; /* No module reference will be taken on type->owner. * Presence of type->inner_ops implies that the expression * is builtin, so it cannot go away. */ rcu_read_unlock(); return 0; out_unlock: rcu_read_unlock(); return err; } static int nf_tables_newexpr(const struct nft_ctx *ctx, const struct nft_expr_info *expr_info, struct nft_expr *expr) { const struct nft_expr_ops *ops = expr_info->ops; int err; expr->ops = ops; if (ops->init) { err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb); if (err < 0) goto err1; } return 0; err1: expr->ops = NULL; return err; } static void nf_tables_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) { const struct nft_expr_type *type = expr->ops->type; if (expr->ops->destroy) expr->ops->destroy(ctx, expr); module_put(type->owner); } static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, const struct nlattr *nla) { struct nft_expr_info expr_info; struct nft_expr *expr; struct module *owner; int err; err = nf_tables_expr_parse(ctx, nla, &expr_info); if (err < 0) goto err_expr_parse; err = -EOPNOTSUPP; if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL)) goto err_expr_stateful; err = -ENOMEM; expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT); if (expr == NULL) goto err_expr_stateful; err = nf_tables_newexpr(ctx, &expr_info, expr); if (err < 0) goto err_expr_new; return expr; err_expr_new: kfree(expr); err_expr_stateful: owner = expr_info.ops->type->owner; if (expr_info.ops->type->release_ops) expr_info.ops->type->release_ops(expr_info.ops); module_put(owner); err_expr_parse: return ERR_PTR(err); } int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp) { int err; if (WARN_ON_ONCE(!src->ops->clone)) return -EINVAL; dst->ops = src->ops; err = src->ops->clone(dst, src, gfp); if (err < 0) return err; __module_get(src->ops->type->owner); return 0; } void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) { nf_tables_expr_destroy(ctx, expr); kfree(expr); } /* * Rules */ static struct nft_rule *__nft_rule_lookup(const struct net *net, const struct nft_chain *chain, u64 handle) { struct nft_rule *rule; // FIXME: this sucks list_for_each_entry_rcu(rule, &chain->rules, list, lockdep_commit_lock_is_held(net)) { if (handle == rule->handle) return rule; } return ERR_PTR(-ENOENT); } static struct nft_rule *nft_rule_lookup(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { [NFTA_RULE_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_RULE_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_RULE_HANDLE] = { .type = NLA_U64 }, [NFTA_RULE_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), [NFTA_RULE_COMPAT] = { .type = NLA_NESTED }, [NFTA_RULE_POSITION] = { .type = NLA_U64 }, [NFTA_RULE_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_RULE_ID] = { .type = NLA_U32 }, [NFTA_RULE_POSITION_ID] = { .type = NLA_U32 }, [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, const struct nft_rule *rule, u64 handle, bool reset) { struct nlmsghdr *nlh; const struct nft_expr *expr, *next; struct nlattr *list; u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle), NFTA_RULE_PAD)) goto nla_put_failure; if (event != NFT_MSG_DELRULE && handle) { if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle), NFTA_RULE_PAD)) goto nla_put_failure; } if (chain->flags & NFT_CHAIN_HW_OFFLOAD) nft_flow_rule_stats(chain, rule); list = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS); if (list == NULL) goto nla_put_failure; nft_rule_for_each_expr(expr, next, rule) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, list); if (rule->udata) { struct nft_userdata *udata = nft_userdata(rule); if (nla_put(skb, NFTA_RULE_USERDATA, udata->len + 1, udata->data) < 0) goto nla_put_failure; } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_rule *prule; struct sk_buff *skb; u64 handle = 0; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; if (event == NFT_MSG_NEWRULE && !list_is_first(&rule->list, &ctx->chain->rules) && !list_is_last(&rule->list, &ctx->chain->rules)) { prule = list_prev_entry(rule, list); handle = prule->handle; } if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE)) flags |= NLM_F_APPEND; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, ctx->chain, rule, handle, false); if (err < 0) { kfree_skb(skb); goto err; } nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static void audit_log_rule_reset(const struct nft_table *table, unsigned int base_seq, unsigned int nentries) { char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq); audit_log_nfcfg(buf, table->family, nentries, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); kfree(buf); } struct nft_rule_dump_ctx { unsigned int s_idx; char *table; char *chain; bool reset; }; static int __nf_tables_dump_rules(struct sk_buff *skb, unsigned int *idx, struct netlink_callback *cb, const struct nft_table *table, const struct nft_chain *chain) { struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; unsigned int entries = 0; int ret = 0; u64 handle; prule = NULL; list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_is_active(net, rule)) goto cont_skip; if (*idx < ctx->s_idx) goto cont; if (prule) handle = prule->handle; else handle = 0; if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, table, chain, rule, handle, ctx->reset) < 0) { ret = 1; break; } entries++; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: prule = rule; cont_skip: (*idx)++; } if (ctx->reset && entries) audit_log_rule_reset(table, cb->seq, entries); return ret; } static int nf_tables_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; struct nft_table *table; const struct nft_chain *chain; unsigned int idx = 0; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; if (ctx->table && strcmp(ctx->table, table->name) != 0) continue; if (ctx->table && ctx->chain) { struct rhlist_head *list, *tmp; list = rhltable_lookup(&table->chains_ht, ctx->chain, nft_chain_ht_params); if (!list) goto done; rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) { if (!nft_is_active(net, chain)) continue; __nf_tables_dump_rules(skb, &idx, cb, table, chain); break; } goto done; } list_for_each_entry_rcu(chain, &table->chains, list) { if (__nf_tables_dump_rules(skb, &idx, cb, table, chain)) goto done; } if (ctx->table) break; } done: rcu_read_unlock(); ctx->s_idx = idx; return skb->len; } static int nf_tables_dumpreset_rules(struct sk_buff *skb, struct netlink_callback *cb) { struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); int ret; /* Mutex is held is to prevent that two concurrent dump-and-reset calls * do not underrun counters and quotas. The commit_mutex is used for * the lack a better lock, this is not transaction path. */ mutex_lock(&nft_net->commit_mutex); ret = nf_tables_dump_rules(skb, cb); mutex_unlock(&nft_net->commit_mutex); return ret; } static int nf_tables_dump_rules_start(struct netlink_callback *cb) { struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; const struct nlattr * const *nla = cb->data; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); if (nla[NFTA_RULE_TABLE]) { ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC); if (!ctx->table) return -ENOMEM; } if (nla[NFTA_RULE_CHAIN]) { ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], GFP_ATOMIC); if (!ctx->chain) { kfree(ctx->table); return -ENOMEM; } } return 0; } static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb) { struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; ctx->reset = true; return nf_tables_dump_rules_start(cb); } static int nf_tables_dump_rules_done(struct netlink_callback *cb) { struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; kfree(ctx->table); kfree(ctx->chain); return 0; } /* Caller must hold rcu read lock or transaction mutex */ static struct sk_buff * nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; const struct nft_rule *rule; struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; int err; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return ERR_CAST(table); } chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return ERR_CAST(chain); } rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return ERR_CAST(rule); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return ERR_PTR(-ENOMEM); err = nf_tables_fill_rule_info(skb2, net, portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule, 0, reset); if (err < 0) { kfree_skb(skb2); return ERR_PTR(err); } return skb2; } static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { u32 portid = NETLINK_CB(skb).portid; struct net *net = info->net; struct sk_buff *skb2; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start= nf_tables_dump_rules_start, .dump = nf_tables_dump_rules, .done = nf_tables_dump_rules_done, .module = THIS_MODULE, .data = (void *)nla, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } skb2 = nf_tables_getrule_single(portid, info, nla, false); if (IS_ERR(skb2)) return PTR_ERR(skb2); return nfnetlink_unicast(skb2, net, portid); } static int nf_tables_getrule_reset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); u32 portid = NETLINK_CB(skb).portid; struct net *net = info->net; struct sk_buff *skb2; char *buf; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start= nf_tables_dumpreset_rules_start, .dump = nf_tables_dumpreset_rules, .done = nf_tables_dump_rules_done, .module = THIS_MODULE, .data = (void *)nla, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!try_module_get(THIS_MODULE)) return -EINVAL; rcu_read_unlock(); mutex_lock(&nft_net->commit_mutex); skb2 = nf_tables_getrule_single(portid, info, nla, true); mutex_unlock(&nft_net->commit_mutex); rcu_read_lock(); module_put(THIS_MODULE); if (IS_ERR(skb2)) return PTR_ERR(skb2); buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), (char *)nla_data(nla[NFTA_RULE_TABLE]), nft_net->base_seq); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); kfree(buf); return nfnetlink_unicast(skb2, net, portid); } void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr, *next; /* * Careful: some expressions might not be initialized in case this * is called on error from nf_tables_newrule(). */ expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { next = nft_expr_next(expr); nf_tables_expr_destroy(ctx, expr); expr = next; } kfree(rule); } /* can only be used if rule is no longer visible to dumps */ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { lockdep_commit_lock_is_held(ctx->net); nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); } /** nft_chain_validate - loop detection and hook validation * * @ctx: context containing call depth and base chain * @chain: chain to validate * * Walk through the rules of the given chain and chase all jumps/gotos * and set lookups until either the jump limit is hit or all reachable * chains have been validated. */ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) { struct nft_expr *expr, *last; struct nft_rule *rule; int err; if (ctx->level == NFT_JUMP_STACK_SIZE) return -EMLINK; list_for_each_entry(rule, &chain->rules, list) { if (fatal_signal_pending(current)) return -EINTR; if (!nft_is_active_next(ctx->net, rule)) continue; nft_rule_for_each_expr(expr, last, rule) { if (!expr->ops->validate) continue; /* This may call nft_chain_validate() recursively, * callers that do so must increment ctx->level. */ err = expr->ops->validate(ctx, expr); if (err < 0) return err; } } return 0; } EXPORT_SYMBOL_GPL(nft_chain_validate); static int nft_table_validate(struct net *net, const struct nft_table *table) { struct nft_chain *chain; struct nft_ctx ctx = { .net = net, .family = table->family, }; int err; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain)) continue; ctx.chain = chain; err = nft_chain_validate(&ctx, chain); if (err < 0) return err; cond_resched(); } return 0; } int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_ctx *pctx = (struct nft_ctx *)ctx; const struct nft_data *data; int err; if (!nft_set_elem_active(ext, iter->genmask)) return 0; if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; data = nft_set_ext_data(ext); switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: pctx->level++; err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; pctx->level--; break; default: break; } return 0; } int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter dummy_iter = { .genmask = nft_genmask_next(ctx->net), }; struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list, lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, dummy_iter.genmask)) continue; ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem); if (ret < 0) return ret; } return ret; } static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla); #define NFT_RULE_MAXEXPRS 128 static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; unsigned int size, i, n, ulen = 0, usize = 0; u8 genmask = nft_genmask_next(info->net); struct nft_rule *rule, *old_rule = NULL; struct nft_expr_info *expr_info = NULL; u8 family = info->nfmsg->nfgen_family; struct nft_flow_rule *flow = NULL; struct net *net = info->net; struct nft_userdata *udata; struct nft_table *table; struct nft_chain *chain; struct nft_trans *trans; u64 handle, pos_handle; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; int err, rem; lockdep_assert_held(&nft_net->commit_mutex); table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); } if (nla[NFTA_RULE_CHAIN]) { chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } } else if (nla[NFTA_RULE_CHAIN_ID]) { chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); return PTR_ERR(chain); } } else { return -EINVAL; } if (nft_chain_is_bound(chain)) return -EOPNOTSUPP; if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); rule = __nft_rule_lookup(net, chain, handle); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return -EEXIST; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) old_rule = rule; else return -EOPNOTSUPP; } else { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) || info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EINVAL; handle = nf_tables_alloc_handle(table); if (nla[NFTA_RULE_POSITION]) { pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); old_rule = __nft_rule_lookup(net, chain, pos_handle); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); } } else if (nla[NFTA_RULE_POSITION_ID]) { old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]); return PTR_ERR(old_rule); } } } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); n = 0; size = 0; if (nla[NFTA_RULE_EXPRESSIONS]) { expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS, sizeof(struct nft_expr_info), GFP_KERNEL); if (!expr_info) return -ENOMEM; nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { err = -EINVAL; if (nla_type(tmp) != NFTA_LIST_ELEM) goto err_release_expr; if (n == NFT_RULE_MAXEXPRS) goto err_release_expr; err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]); if (err < 0) { NL_SET_BAD_ATTR(extack, tmp); goto err_release_expr; } size += expr_info[n].ops->size; n++; } } /* Check for overflow of dlen field */ err = -EFBIG; if (size >= 1 << 12) goto err_release_expr; if (nla[NFTA_RULE_USERDATA]) { ulen = nla_len(nla[NFTA_RULE_USERDATA]); if (ulen > 0) usize = sizeof(struct nft_userdata) + ulen; } err = -ENOMEM; rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT); if (rule == NULL) goto err_release_expr; nft_activate_next(net, rule); rule->handle = handle; rule->dlen = size; rule->udata = ulen ? 1 : 0; if (ulen) { udata = nft_userdata(rule); udata->len = ulen - 1; nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen); } expr = nft_expr_first(rule); for (i = 0; i < n; i++) { err = nf_tables_newexpr(&ctx, &expr_info[i], expr); if (err < 0) { NL_SET_BAD_ATTR(extack, expr_info[i].attr); goto err_release_rule; } if (expr_info[i].ops->validate) nft_validate_state_update(table, NFT_VALIDATE_NEED); expr_info[i].ops = NULL; expr = nft_expr_next(expr); } if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { flow = nft_flow_rule_create(net, rule); if (IS_ERR(flow)) { err = PTR_ERR(flow); goto err_release_rule; } } if (!nft_use_inc(&chain->use)) { err = -EMFILE; goto err_release_rule; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { if (nft_chain_binding(chain)) { err = -EOPNOTSUPP; goto err_destroy_flow_rule; } err = nft_delrule(&ctx, old_rule); if (err < 0) goto err_destroy_flow_rule; trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; goto err_destroy_flow_rule; } list_add_tail_rcu(&rule->list, &old_rule->list); } else { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (!trans) { err = -ENOMEM; goto err_destroy_flow_rule; } if (info->nlh->nlmsg_flags & NLM_F_APPEND) { if (old_rule) list_add_rcu(&rule->list, &old_rule->list); else list_add_tail_rcu(&rule->list, &chain->rules); } else { if (old_rule) list_add_tail_rcu(&rule->list, &old_rule->list); else list_add_rcu(&rule->list, &chain->rules); } } kvfree(expr_info); if (flow) nft_trans_flow_rule(trans) = flow; if (table->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); return 0; err_destroy_flow_rule: nft_use_dec_restore(&chain->use); if (flow) nft_flow_rule_destroy(flow); err_release_rule: nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR); nf_tables_rule_destroy(&ctx, rule); err_release_expr: for (i = 0; i < n; i++) { if (expr_info[i].ops) { module_put(expr_info[i].ops->type->owner); if (expr_info[i].ops->type->release_ops) expr_info[i].ops->type->release_ops(expr_info[i].ops); } } kvfree(expr_info); return err; } static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla) { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWRULE && nft_trans_rule_chain(trans) == chain && id == nft_trans_rule_id(trans)) return nft_trans_rule(trans); } return ERR_PTR(-ENOENT); } static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; struct net *net = info->net; struct nft_table *table; struct nft_rule *rule; struct nft_ctx ctx; int err = 0; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); } if (nla[NFTA_RULE_CHAIN]) { chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) return 0; NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } if (nft_chain_binding(chain)) return -EOPNOTSUPP; } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { if (PTR_ERR(rule) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) return 0; NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } err = nft_delrule(&ctx, rule); } else if (nla[NFTA_RULE_ID]) { rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]); return PTR_ERR(rule); } err = nft_delrule(&ctx, rule); } else { err = nft_delrule_by_chain(&ctx); } } else { list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; if (nft_chain_binding(chain)) continue; ctx.chain = chain; err = nft_delrule_by_chain(&ctx); if (err < 0) break; } } return err; } /* * Sets */ static const struct nft_set_type *nft_set_types[] = { &nft_set_hash_fast_type, &nft_set_hash_type, &nft_set_rhash_type, &nft_set_bitmap_type, &nft_set_rbtree_type, #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) &nft_set_pipapo_avx2_type, #endif &nft_set_pipapo_type, }; #define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \ NFT_SET_TIMEOUT | NFT_SET_OBJECT | \ NFT_SET_EVAL) static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) { return (flags & type->features) == (flags & NFT_SET_FEATURES); } /* * Select a set implementation based on the data characteristics and the * given policy. The total memory use might not be known if no size is * given, in that case the amount of memory per element is used. */ static const struct nft_set_ops * nft_select_set_ops(const struct nft_ctx *ctx, u32 flags, const struct nft_set_desc *desc) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; int i; lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); bops = NULL; best.size = ~0; best.lookup = ~0; best.space = ~0; for (i = 0; i < ARRAY_SIZE(nft_set_types); i++) { type = nft_set_types[i]; ops = &type->ops; if (!nft_set_ops_candidate(type, flags)) continue; if (!ops->estimate(desc, flags, &est)) continue; switch (desc->policy) { case NFT_SET_POL_PERFORMANCE: if (est.lookup < best.lookup) break; if (est.lookup == best.lookup && est.space < best.space) break; continue; case NFT_SET_POL_MEMORY: if (!desc->size) { if (est.space < best.space) break; if (est.space == best.space && est.lookup < best.lookup) break; } else if (est.size < best.size || !bops) { break; } continue; default: break; } bops = ops; best = est; } if (bops != NULL) return bops; return ERR_PTR(-EOPNOTSUPP); } static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, [NFTA_SET_DATA_TYPE] = { .type = NLA_U32 }, [NFTA_SET_DATA_LEN] = { .type = NLA_U32 }, [NFTA_SET_POLICY] = { .type = NLA_U32 }, [NFTA_SET_DESC] = { .type = NLA_NESTED }, [NFTA_SET_ID] = { .type = NLA_U32 }, [NFTA_SET_TIMEOUT] = { .type = NLA_U64 }, [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 }, [NFTA_SET_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), }; static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy), }; static struct nft_set *nft_set_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_set *set; if (nla == NULL) return ERR_PTR(-EINVAL); list_for_each_entry_rcu(set, &table->sets, list, lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, set->name) && nft_active_genmask(set, genmask)) return set; } return ERR_PTR(-ENOENT); } static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_set *set; list_for_each_entry(set, &table->sets, list) { if (be64_to_cpu(nla_get_be64(nla)) == set->handle && nft_active_genmask(set, genmask)) return set; } return ERR_PTR(-ENOENT); } static struct nft_set *nft_set_lookup_byid(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans_set *trans; /* its likely the id we need is at the tail, not at start */ list_for_each_entry_reverse(trans, &nft_net->commit_set_list, list_trans_newset) { struct nft_set *set = trans->set; if (id == trans->set_id && set->table == table && nft_active_genmask(set, genmask)) return set; } return ERR_PTR(-ENOENT); } struct nft_set *nft_set_lookup_global(const struct net *net, const struct nft_table *table, const struct nlattr *nla_set_name, const struct nlattr *nla_set_id, u8 genmask) { struct nft_set *set; set = nft_set_lookup(net, table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; set = nft_set_lookup_byid(net, table, nla_set_id, genmask); } return set; } EXPORT_SYMBOL_GPL(nft_set_lookup_global); static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, const char *name) { const struct nft_set *i; const char *p; unsigned long *inuse; unsigned int n = 0, min = 0; p = strchr(name, '%'); if (p != NULL) { if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN) return -EINVAL; inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (inuse == NULL) return -ENOMEM; cont: list_for_each_entry(i, &ctx->table->sets, list) { int tmp; if (!nft_is_active_next(ctx->net, i)) continue; if (!sscanf(i->name, name, &tmp)) continue; if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) continue; set_bit(tmp - min, inuse); } n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE); if (n >= BITS_PER_BYTE * PAGE_SIZE) { min += BITS_PER_BYTE * PAGE_SIZE; memset(inuse, 0, PAGE_SIZE); goto cont; } free_page((unsigned long)inuse); } set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n); if (!set->name) return -ENOMEM; list_for_each_entry(i, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, i)) continue; if (!strcmp(set->name, i->name)) { kfree(set->name); set->name = NULL; return -ENFILE; } } return 0; } int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) { u64 ms = be64_to_cpu(nla_get_be64(nla)); u64 max = (u64)(~((u64)0)); max = div_u64(max, NSEC_PER_MSEC); if (ms >= max) return -ERANGE; ms *= NSEC_PER_MSEC; *result = nsecs_to_jiffies64(ms) ? : !!ms; return 0; } __be64 nf_jiffies64_to_msecs(u64 input) { return cpu_to_be64(jiffies64_to_msecs(input)); } static int nf_tables_fill_set_concat(struct sk_buff *skb, const struct nft_set *set) { struct nlattr *concat, *field; int i; concat = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT); if (!concat) return -ENOMEM; for (i = 0; i < set->field_count; i++) { field = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); if (!field) return -ENOMEM; if (nla_put_be32(skb, NFTA_SET_FIELD_LEN, htonl(set->field_len[i]))) return -ENOMEM; nla_nest_end(skb, field); } nla_nest_end(skb, concat); return 0; } static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size) { if (ops->usize) return ops->usize(size); return size; } static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { u64 timeout = READ_ONCE(set->timeout); u32 gc_int = READ_ONCE(set->gc_int); u32 portid = ctx->portid; struct nlmsghdr *nlh; struct nlattr *nest; u32 seq = ctx->seq; int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, NFNETLINK_V0, nft_base_seq(ctx->net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle), NFTA_SET_PAD)) goto nla_put_failure; if (event == NFT_MSG_DELSET) { nlmsg_end(skb, nlh); return 0; } if (set->flags != 0) if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen))) goto nla_put_failure; if (set->flags & NFT_SET_MAP) { if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen))) goto nla_put_failure; } if (set->flags & NFT_SET_OBJECT && nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype))) goto nla_put_failure; if (timeout && nla_put_be64(skb, NFTA_SET_TIMEOUT, nf_jiffies64_to_msecs(timeout), NFTA_SET_PAD)) goto nla_put_failure; if (gc_int && nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(gc_int))) goto nla_put_failure; if (set->policy != NFT_SET_POL_PERFORMANCE) { if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy))) goto nla_put_failure; } if (set->udata && nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_SET_DESC); if (!nest) goto nla_put_failure; if (set->size && nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(nft_set_userspace_size(set->ops, set->size)))) goto nla_put_failure; if (set->field_count > 1 && nf_tables_fill_set_concat(skb, set)) goto nla_put_failure; nla_nest_end(skb, nest); if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (set->num_exprs > 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS); if (nest == NULL) goto nla_put_failure; for (i = 0; i < set->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, set->exprs[i], false) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } static void nf_tables_set_notify(const struct nft_ctx *ctx, const struct nft_set *set, int event, gfp_t gfp_flags) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); u32 portid = ctx->portid; struct sk_buff *skb; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); if (skb == NULL) goto err; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_set(skb, ctx, set, event, flags); if (err < 0) { kfree_skb(skb); goto err; } nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) { const struct nft_set *set; unsigned int idx, s_idx = cb->args[0]; struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); struct nft_ctx *ctx = cb->data, ctx_set; struct nftables_pernet *nft_net; if (cb->args[1]) return skb->len; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && ctx->family != table->family) continue; if (ctx->table && ctx->table != table) continue; if (cur_table) { if (cur_table != table) continue; cur_table = NULL; } idx = 0; list_for_each_entry_rcu(set, &table->sets, list) { if (idx < s_idx) goto cont; if (!nft_is_active(net, set)) goto cont; ctx_set = *ctx; ctx_set.table = table; ctx_set.family = table->family; if (nf_tables_fill_set(skb, &ctx_set, set, NFT_MSG_NEWSET, NLM_F_MULTI) < 0) { cb->args[0] = idx; cb->args[2] = (unsigned long) table; goto done; } nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } if (s_idx) s_idx = 0; } cb->args[1] = 1; done: rcu_read_unlock(); return skb->len; } static int nf_tables_dump_sets_start(struct netlink_callback *cb) { struct nft_ctx *ctx_dump = NULL; ctx_dump = kmemdup(cb->data, sizeof(*ctx_dump), GFP_ATOMIC); if (ctx_dump == NULL) return -ENOMEM; cb->data = ctx_dump; return 0; } static int nf_tables_dump_sets_done(struct netlink_callback *cb) { kfree(cb->data); return 0; } /* called with rcu_read_lock held */ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_table *table = NULL; struct net *net = info->net; const struct nft_set *set; struct sk_buff *skb2; struct nft_ctx ctx; int err; if (nla[NFTA_SET_TABLE]) { table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); } } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_sets_start, .dump = nf_tables_dump_sets, .done = nf_tables_dump_sets_done, .data = &ctx, .module = THIS_MODULE, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } /* Only accept unspec with dump */ if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; if (!nla[NFTA_SET_TABLE]) return -EINVAL; set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) return -ENOMEM; err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0); if (err < 0) goto err_fill_set_info; return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_set_info: kfree_skb(skb2); return err; } static int nft_set_desc_concat_parse(const struct nlattr *attr, struct nft_set_desc *desc) { struct nlattr *tb[NFTA_SET_FIELD_MAX + 1]; u32 len; int err; if (desc->field_count >= ARRAY_SIZE(desc->field_len)) return -E2BIG; err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr, nft_concat_policy, NULL); if (err < 0) return err; if (!tb[NFTA_SET_FIELD_LEN]) return -EINVAL; len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN])); if (!len || len > U8_MAX) return -EINVAL; desc->field_len[desc->field_count++] = len; return 0; } static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { u32 len = 0, num_regs; struct nlattr *attr; int rem, err, i; nla_for_each_nested(attr, nla, rem) { if (nla_type(attr) != NFTA_LIST_ELEM) return -EINVAL; err = nft_set_desc_concat_parse(attr, desc); if (err < 0) return err; } for (i = 0; i < desc->field_count; i++) len += round_up(desc->field_len[i], sizeof(u32)); if (len != desc->klen) return -EINVAL; num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); if (num_regs > NFT_REG32_COUNT) return -E2BIG; return 0; } static int nf_tables_set_desc_parse(struct nft_set_desc *desc, const struct nlattr *nla) { struct nlattr *da[NFTA_SET_DESC_MAX + 1]; int err; err = nla_parse_nested_deprecated(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy, NULL); if (err < 0) return err; if (da[NFTA_SET_DESC_SIZE] != NULL) desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); if (da[NFTA_SET_DESC_CONCAT]) err = nft_set_desc_concat(desc, da[NFTA_SET_DESC_CONCAT]); return err; } static int nft_set_expr_alloc(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr * const *nla, struct nft_expr **exprs, int *num_exprs, u32 flags) { struct nft_expr *expr; int err, i; if (nla[NFTA_SET_EXPR]) { expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_EXPR]); if (IS_ERR(expr)) { err = PTR_ERR(expr); goto err_set_expr_alloc; } exprs[0] = expr; (*num_exprs)++; } else if (nla[NFTA_SET_EXPRESSIONS]) { struct nlattr *tmp; int left; if (!(flags & NFT_SET_EXPR)) { err = -EINVAL; goto err_set_expr_alloc; } i = 0; nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { err = -E2BIG; goto err_set_expr_alloc; } if (nla_type(tmp) != NFTA_LIST_ELEM) { err = -EINVAL; goto err_set_expr_alloc; } expr = nft_set_elem_expr_alloc(ctx, set, tmp); if (IS_ERR(expr)) { err = PTR_ERR(expr); goto err_set_expr_alloc; } exprs[i++] = expr; (*num_exprs)++; } } return 0; err_set_expr_alloc: for (i = 0; i < *num_exprs; i++) nft_expr_destroy(ctx, exprs[i]); return err; } static bool nft_set_is_same(const struct nft_set *set, const struct nft_set_desc *desc, struct nft_expr *exprs[], u32 num_exprs, u32 flags) { int i; if (set->ktype != desc->ktype || set->dtype != desc->dtype || set->flags != flags || set->klen != desc->klen || set->dlen != desc->dlen || set->field_count != desc->field_count || set->num_exprs != num_exprs) return false; for (i = 0; i < desc->field_count; i++) { if (set->field_len[i] != desc->field_len[i]) return false; } for (i = 0; i < num_exprs; i++) { if (set->exprs[i]->ops != exprs[i]->ops) return false; } return true; } static u32 nft_set_kernel_size(const struct nft_set_ops *ops, const struct nft_set_desc *desc) { if (ops->ksize) return ops->ksize(desc->size); return desc->size; } static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_set_ops *ops; struct net *net = info->net; struct nft_set_desc desc; struct nft_table *table; unsigned char *udata; struct nft_set *set; struct nft_ctx ctx; size_t alloc_size; int num_exprs = 0; char *name; int err, i; u16 udlen; u32 flags; u64 size; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || nla[NFTA_SET_KEY_LEN] == NULL || nla[NFTA_SET_ID] == NULL) return -EINVAL; memset(&desc, 0, sizeof(desc)); desc.ktype = NFT_DATA_VALUE; if (nla[NFTA_SET_KEY_TYPE] != NULL) { desc.ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); if ((desc.ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) return -EINVAL; } desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); if (desc.klen == 0 || desc.klen > NFT_DATA_VALUE_MAXLEN) return -EINVAL; flags = 0; if (nla[NFTA_SET_FLAGS] != NULL) { flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR)) return -EOPNOTSUPP; /* Only one of these operations is supported */ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == (NFT_SET_MAP | NFT_SET_OBJECT)) return -EOPNOTSUPP; if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) == (NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; if ((flags & (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT | NFT_SET_EVAL)) == (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT)) return -EOPNOTSUPP; if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) == (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) return -EOPNOTSUPP; } desc.dtype = 0; if (nla[NFTA_SET_DATA_TYPE] != NULL) { if (!(flags & NFT_SET_MAP)) return -EINVAL; desc.dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); if ((desc.dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && desc.dtype != NFT_DATA_VERDICT) return -EINVAL; if (desc.dtype != NFT_DATA_VERDICT) { if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); if (desc.dlen == 0 || desc.dlen > NFT_DATA_VALUE_MAXLEN) return -EINVAL; } else desc.dlen = sizeof(struct nft_verdict); } else if (flags & NFT_SET_MAP) return -EINVAL; if (nla[NFTA_SET_OBJ_TYPE] != NULL) { if (!(flags & NFT_SET_OBJECT)) return -EINVAL; desc.objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); if (desc.objtype == NFT_OBJECT_UNSPEC || desc.objtype > NFT_OBJECT_MAX) return -EOPNOTSUPP; } else if (flags & NFT_SET_OBJECT) return -EINVAL; else desc.objtype = NFT_OBJECT_UNSPEC; desc.timeout = 0; if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; if (flags & NFT_SET_ANONYMOUS) return -EOPNOTSUPP; err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; } desc.gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; if (flags & NFT_SET_ANONYMOUS) return -EOPNOTSUPP; desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } desc.policy = NFT_SET_POL_PERFORMANCE; if (nla[NFTA_SET_POLICY] != NULL) { desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); switch (desc.policy) { case NFT_SET_POL_PERFORMANCE: case NFT_SET_POL_MEMORY: break; default: return -EOPNOTSUPP; } } if (nla[NFTA_SET_DESC] != NULL) { err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); if (err < 0) return err; if (desc.field_count > 1) { if (!(flags & NFT_SET_CONCAT)) return -EINVAL; } else if (flags & NFT_SET_CONCAT) { return -EINVAL; } } else if (flags & NFT_SET_CONCAT) { return -EINVAL; } if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS]) desc.expr = true; table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); } } else { struct nft_expr *exprs[NFT_SET_EXPR_MAX] = {}; if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return -EEXIST; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (nft_set_is_anonymous(set)) return -EOPNOTSUPP; err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); if (err < 0) return err; if (desc.size) desc.size = nft_set_kernel_size(set->ops, &desc); err = 0; if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); err = -EEXIST; } for (i = 0; i < num_exprs; i++) nft_expr_destroy(&ctx, exprs[i]); if (err < 0) return err; return __nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set, &desc); } if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; ops = nft_select_set_ops(&ctx, flags, &desc); if (IS_ERR(ops)) return PTR_ERR(ops); if (desc.size) desc.size = nft_set_kernel_size(ops, &desc); udlen = 0; if (nla[NFTA_SET_USERDATA]) udlen = nla_len(nla[NFTA_SET_USERDATA]); size = 0; if (ops->privsize != NULL) size = ops->privsize(nla, &desc); alloc_size = sizeof(*set) + size + udlen; if (alloc_size < size || alloc_size > INT_MAX) return -ENOMEM; if (!nft_use_inc(&table->use)) return -EMFILE; set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT); if (!set) { err = -ENOMEM; goto err_alloc; } name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT); if (!name) { err = -ENOMEM; goto err_set_name; } err = nf_tables_set_alloc_name(&ctx, set, name); kfree(name); if (err < 0) goto err_set_name; udata = NULL; if (udlen) { udata = set->data + size; nla_memcpy(udata, nla[NFTA_SET_USERDATA], udlen); } INIT_LIST_HEAD(&set->bindings); INIT_LIST_HEAD(&set->catchall_list); refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; set->ktype = desc.ktype; set->klen = desc.klen; set->dtype = desc.dtype; set->objtype = desc.objtype; set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; set->policy = desc.policy; set->udlen = udlen; set->udata = udata; set->timeout = desc.timeout; set->gc_int = desc.gc_int; set->field_count = desc.field_count; for (i = 0; i < desc.field_count; i++) set->field_len[i] = desc.field_len[i]; err = ops->init(set, &desc, nla); if (err < 0) goto err_set_init; err = nft_set_expr_alloc(&ctx, set, nla, set->exprs, &num_exprs, flags); if (err < 0) goto err_set_destroy; set->num_exprs = num_exprs; set->handle = nf_tables_alloc_handle(table); INIT_LIST_HEAD(&set->pending_update); err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) goto err_set_expr_alloc; list_add_tail_rcu(&set->list, &table->sets); return 0; err_set_expr_alloc: for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); err_set_destroy: ops->destroy(&ctx, set); err_set_init: kfree(set->name); err_set_name: kvfree(set); err_alloc: nft_use_dec_restore(&table->use); return err; } static void nft_set_catchall_destroy(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_elem_catchall *next, *catchall; list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); nf_tables_set_elem_destroy(ctx, set, catchall->elem); kfree_rcu(catchall, rcu); } } static void nft_set_put(struct nft_set *set) { if (refcount_dec_and_test(&set->refs)) { kfree(set->name); kvfree(set); } } static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { int i; if (WARN_ON(set->use > 0)) return; for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(ctx, set->exprs[i]); set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); nft_set_put(set); } static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); } if (nla[NFTA_SET_HANDLE]) { attr = nla[NFTA_SET_HANDLE]; set = nft_set_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_SET_NAME]; set = nft_set_lookup(net, table, attr, genmask); } if (IS_ERR(set)) { if (PTR_ERR(set) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET) return 0; NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(set); } if (set->use || (info->nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nft_delset(&ctx, set); } static int nft_validate_register_store(const struct nft_ctx *ctx, enum nft_registers reg, const struct nft_data *data, enum nft_data_types type, unsigned int len); static int nft_setelem_data_validate(const struct nft_ctx *ctx, struct nft_set *set, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); enum nft_registers dreg; dreg = nft_type_to_reg(set->dtype); return nft_validate_register_store(ctx, dreg, nft_set_ext_data(ext), set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, set->dlen); } static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (!nft_set_elem_active(ext, iter->genmask)) return 0; return nft_setelem_data_validate(ctx, set, elem_priv); } static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list, lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; ret = nft_setelem_data_validate(ctx, set, catchall->elem); if (ret < 0) break; } return ret; } int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { struct nft_set_binding *i; struct nft_set_iter iter; if (!list_empty(&set->bindings) && nft_set_is_anonymous(set)) return -EBUSY; if (binding->flags & NFT_SET_MAP) { /* If the set is already bound to the same chain all * jumps are already validated for that chain. */ list_for_each_entry(i, &set->bindings, list) { if (i->flags & NFT_SET_MAP && i->chain == binding->chain) goto bind; } iter.genmask = nft_genmask_next(ctx->net); iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; iter.fn = nf_tables_bind_check_setelem; set->ops->walk(ctx, set, &iter); if (!iter.err) iter.err = nft_set_catchall_bind_check(ctx, set); if (iter.err < 0) return iter.err; } bind: if (!nft_use_inc(&set->use)) return -EMFILE; binding->chain = ctx->chain; list_add_tail_rcu(&binding->list, &set->bindings); nft_set_trans_bind(ctx, set); return 0; } EXPORT_SYMBOL_GPL(nf_tables_bind_set); static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, bool event) { list_del_rcu(&binding->list); if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) { list_del_rcu(&set->list); set->dead = 1; if (event) nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL); } } static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); static int nft_mapelem_activate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); /* called from abort path, reverse check to undo changes. */ if (nft_set_elem_active(ext, iter->genmask)) return 0; nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, elem_priv); return 0; } static void nft_map_catchall_activate(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, catchall->elem); break; } } static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), .type = NFT_ITER_UPDATE, .fn = nft_mapelem_activate, }; set->ops->walk(ctx, set, &iter); WARN_ON_ONCE(iter.err); nft_map_catchall_activate(ctx, set); } void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) { if (nft_set_is_anonymous(set)) { if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_activate(ctx, set); nft_clear(ctx->net, set); } nft_use_inc_restore(&set->use); } EXPORT_SYMBOL_GPL(nf_tables_activate_set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase) { lockdep_commit_lock_is_held(ctx->net); switch (phase) { case NFT_TRANS_PREPARE_ERROR: nft_set_trans_unbind(ctx, set); if (nft_set_is_anonymous(set)) nft_deactivate_next(ctx->net, set); else list_del_rcu(&binding->list); nft_use_dec(&set->use); break; case NFT_TRANS_PREPARE: if (nft_set_is_anonymous(set)) { if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(ctx, set); nft_deactivate_next(ctx->net, set); } nft_use_dec(&set->use); return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: if (nft_set_is_anonymous(set) && set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(ctx, set); nft_use_dec(&set->use); fallthrough; default: nf_tables_unbind_set(ctx, set, binding, phase == NFT_TRANS_COMMIT); } } EXPORT_SYMBOL_GPL(nf_tables_deactivate_set); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set) { if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) nft_set_destroy(ctx, set); } EXPORT_SYMBOL_GPL(nf_tables_destroy_set); const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_KEY] = { .align = __alignof__(u32), }, [NFT_SET_EXT_DATA] = { .align = __alignof__(u32), }, [NFT_SET_EXT_EXPRESSIONS] = { .align = __alignof__(struct nft_set_elem_expr), }, [NFT_SET_EXT_OBJREF] = { .len = sizeof(struct nft_object *), .align = __alignof__(struct nft_object *), }, [NFT_SET_EXT_FLAGS] = { .len = sizeof(u8), .align = __alignof__(u8), }, [NFT_SET_EXT_TIMEOUT] = { .len = sizeof(struct nft_timeout), .align = __alignof__(struct nft_timeout), }, [NFT_SET_EXT_USERDATA] = { .len = sizeof(struct nft_userdata), .align = __alignof__(struct nft_userdata), }, [NFT_SET_EXT_KEY_END] = { .align = __alignof__(u32), }, }; /* * Set elements */ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_KEY] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, [NFTA_SET_ELEM_EXPIRATION] = { .type = NLA_U64 }, [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_LIST_ELEMENTS] = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy), [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; static int nft_set_elem_expr_dump(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_ext *ext, bool reset) { struct nft_set_elem_expr *elem_expr; u32 size, num_exprs = 0; struct nft_expr *expr; struct nlattr *nest; elem_expr = nft_set_ext_expr(ext); nft_setelem_expr_foreach(expr, elem_expr, size) num_exprs++; if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0) return -1; return 0; } else if (num_exprs > 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS); if (nest == NULL) goto nla_put_failure; nft_setelem_expr_foreach(expr, elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); } return 0; nla_put_failure: return -1; } static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool reset) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); if (nest == NULL) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) && nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext), NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && nft_data_dump(skb, NFTA_SET_ELEM_KEY_END, nft_set_ext_key_end(ext), NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), nft_set_datatype(set), set->dlen) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && nft_set_elem_expr_dump(skb, set, ext, reset)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && nla_put_string(skb, NFTA_SET_ELEM_OBJREF, (*nft_set_ext_obj(ext))->key.name) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { u64 timeout = READ_ONCE(nft_set_ext_timeout(ext)->timeout); u64 set_timeout = READ_ONCE(set->timeout); __be64 msecs = 0; if (set_timeout != timeout) { msecs = nf_jiffies64_to_msecs(timeout); if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, msecs, NFTA_SET_ELEM_PAD)) goto nla_put_failure; } if (timeout > 0) { u64 expires, now = get_jiffies_64(); expires = READ_ONCE(nft_set_ext_timeout(ext)->expiration); if (time_before64(now, expires)) expires -= now; else expires = 0; if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; } } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { struct nft_userdata *udata; udata = nft_set_ext_userdata(ext); if (nla_put(skb, NFTA_SET_ELEM_USERDATA, udata->len + 1, udata->data)) goto nla_put_failure; } nla_nest_end(skb, nest); return 0; nla_put_failure: nlmsg_trim(skb, b); return -EMSGSIZE; } struct nft_set_dump_args { const struct netlink_callback *cb; struct nft_set_iter iter; struct sk_buff *skb; bool reset; }; static int nf_tables_dump_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; if (!nft_set_elem_active(ext, iter->genmask)) return 0; if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; args = container_of(iter, struct nft_set_dump_args, iter); return nf_tables_fill_setelem(args->skb, set, elem_priv, args->reset); } static void audit_log_nft_set_reset(const struct nft_table *table, unsigned int base_seq, unsigned int nentries) { char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq); audit_log_nfcfg(buf, table->family, nentries, AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC); kfree(buf); } struct nft_set_dump_ctx { const struct nft_set *set; struct nft_ctx ctx; bool reset; }; static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, const struct nft_set *set, bool reset, unsigned int base_seq) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask) || nft_set_elem_expired(ext)) continue; ret = nf_tables_fill_setelem(skb, set, catchall->elem, reset); if (reset && !ret) audit_log_nft_set_reset(set->table, base_seq, 1); break; } return ret; } static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); struct nftables_pernet *nft_net; struct nft_table *table; struct nft_set *set; struct nft_set_dump_args args; bool set_found = false; struct nlmsghdr *nlh; struct nlattr *nest; u32 portid, seq; int event; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && dump_ctx->ctx.family != table->family) continue; if (table != dump_ctx->ctx.table) continue; list_for_each_entry_rcu(set, &table->sets, list) { if (set == dump_ctx->set) { set_found = true; break; } } break; } if (!set_found) { rcu_read_unlock(); return -ENOENT; } event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM); portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, table->family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS); if (nest == NULL) goto nla_put_failure; args.cb = cb; args.skb = skb; args.reset = dump_ctx->reset; args.iter.genmask = nft_genmask_cur(net); args.iter.type = NFT_ITER_READ; args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; set->ops->walk(&dump_ctx->ctx, set, &args.iter); if (!args.iter.err && args.iter.count == cb->args[0]) args.iter.err = nft_set_catchall_dump(net, skb, set, dump_ctx->reset, cb->seq); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); rcu_read_unlock(); if (args.iter.err && args.iter.err != -EMSGSIZE) return args.iter.err; if (args.iter.count == cb->args[0]) return 0; cb->args[0] = args.iter.count; return skb->len; nla_put_failure: rcu_read_unlock(); return -ENOSPC; } static int nf_tables_dumpreset_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); struct nft_set_dump_ctx *dump_ctx = cb->data; int ret, skip = cb->args[0]; mutex_lock(&nft_net->commit_mutex); ret = nf_tables_dump_set(skb, cb); if (cb->args[0] > skip) audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq, cb->args[0] - skip); mutex_unlock(&nft_net->commit_mutex); return ret; } static int nf_tables_dump_set_start(struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; cb->data = kmemdup(dump_ctx, sizeof(*dump_ctx), GFP_ATOMIC); return cb->data ? 0 : -ENOMEM; } static int nf_tables_dump_set_done(struct netlink_callback *cb) { kfree(cb->data); return 0; } static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool reset) { struct nlmsghdr *nlh; struct nlattr *nest; int err; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, NFNETLINK_V0, nft_base_seq(ctx->net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS); if (nest == NULL) goto nla_put_failure; err = nf_tables_fill_setelem(skb, set, elem_priv, reset); if (err < 0) goto nla_put_failure; nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } static int nft_setelem_parse_flags(const struct nft_set *set, const struct nlattr *attr, u32 *flags) { if (attr == NULL) return 0; *flags = ntohl(nla_get_be32(attr)); if (*flags & ~(NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) return -EOPNOTSUPP; if (!(set->flags & NFT_SET_INTERVAL) && *flags & NFT_SET_ELEM_INTERVAL_END) return -EINVAL; if ((*flags & (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) == (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) return -EINVAL; return 0; } static int nft_setelem_parse_key(struct nft_ctx *ctx, const struct nft_set *set, struct nft_data *key, struct nlattr *attr) { struct nft_data_desc desc = { .type = NFT_DATA_VALUE, .size = NFT_DATA_VALUE_MAXLEN, .len = set->klen, }; return nft_data_init(ctx, key, &desc, attr); } static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, struct nft_data_desc *desc, struct nft_data *data, struct nlattr *attr) { u32 dtype; if (set->dtype == NFT_DATA_VERDICT) dtype = NFT_DATA_VERDICT; else dtype = NFT_DATA_VALUE; desc->type = dtype; desc->size = NFT_DATA_VALUE_MAXLEN; desc->len = set->dlen; desc->flags = NFT_DATA_DESC_SETELEM; return nft_data_init(ctx, data, desc, attr); } static void *nft_setelem_catchall_get(const struct net *net, const struct nft_set *set) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); struct nft_set_ext *ext; void *priv = NULL; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask) || nft_set_elem_expired(ext)) continue; priv = catchall->elem; break; } return priv; } static int nft_setelem_get(struct nft_ctx *ctx, const struct nft_set *set, struct nft_set_elem *elem, u32 flags) { void *priv; if (!(flags & NFT_SET_ELEM_CATCHALL)) { priv = set->ops->get(ctx->net, set, elem, flags); if (IS_ERR(priv)) return PTR_ERR(priv); } else { priv = nft_setelem_catchall_get(ctx->net, set); if (!priv) return -ENOENT; } elem->priv = priv; return 0; } static int nft_get_set_elem(struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr, bool reset) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_elem elem; struct sk_buff *skb; uint32_t flags = 0; int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) return err; err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) return -EINVAL; if (nla[NFTA_SET_ELEM_KEY]) { err = nft_setelem_parse_key(ctx, set, &elem.key.val, nla[NFTA_SET_ELEM_KEY]); if (err < 0) return err; } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, nla[NFTA_SET_ELEM_KEY_END]); if (err < 0) return err; } err = nft_setelem_get(ctx, set, &elem, flags); if (err < 0) return err; err = -ENOMEM; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb == NULL) return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, NFT_MSG_NEWSETELEM, 0, set, elem.priv, reset); if (err < 0) goto err_fill_setelem; return nfnetlink_unicast(skb, ctx->net, ctx->portid); err_fill_setelem: kfree_skb(skb); return err; } static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx, const struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; struct nft_table *table; struct nft_set *set; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); } set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); } nft_ctx_init(&dump_ctx->ctx, net, skb, info->nlh, family, table, NULL, nla); dump_ctx->set = set; dump_ctx->reset = reset; return 0; } /* called with rcu_read_lock held */ static int nf_tables_getsetelem(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; struct nft_set_dump_ctx dump_ctx; struct nlattr *attr; int rem, err = 0; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, .dump = nf_tables_dump_set, .done = nf_tables_dump_set_done, .module = THIS_MODULE, }; err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); if (err) return err; c.data = &dump_ctx; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); if (err) return err; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; } } return err; } static int nf_tables_getsetelem_reset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; struct nft_set_dump_ctx dump_ctx; int rem, err = 0, nelems = 0; struct nlattr *attr; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, .dump = nf_tables_dumpreset_set, .done = nf_tables_dump_set_done, .module = THIS_MODULE, }; err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); if (err) return err; c.data = &dump_ctx; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; if (!try_module_get(THIS_MODULE)) return -EINVAL; rcu_read_unlock(); mutex_lock(&nft_net->commit_mutex); rcu_read_lock(); err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); if (err) goto out_unlock; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; } nelems++; } audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); out_unlock: rcu_read_unlock(); mutex_unlock(&nft_net->commit_mutex); rcu_read_lock(); module_put(THIS_MODULE); return err; } static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv, int event) { struct nftables_pernet *nft_net; struct net *net = ctx->net; u32 portid = ctx->portid; struct sk_buff *skb; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, set, elem_priv, false); if (err < 0) { kfree_skb(skb); goto err; } nft_net = nft_pernet(net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { struct nft_trans_elem *te; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1)); if (trans == NULL) return NULL; te = nft_trans_container_elem(trans); te->nelems = 1; te->set = set; return trans; } struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr) { struct nft_expr *expr; int err; expr = nft_expr_init(ctx, attr); if (IS_ERR(expr)) return expr; err = -EOPNOTSUPP; if (expr->ops->type->flags & NFT_EXPR_GC) { if (set->flags & NFT_SET_TIMEOUT) goto err_set_elem_expr; if (!set->ops->gc_init) goto err_set_elem_expr; set->ops->gc_init(set); } return expr; err_set_elem_expr: nft_expr_destroy(ctx, expr); return ERR_PTR(err); } static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len) { len += nft_set_ext_types[id].len; if (len > tmpl->ext_len[id] || len > U8_MAX) return -1; return 0; } static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id, void *to, const void *from, u32 len) { if (nft_set_ext_check(tmpl, id, len) < 0) return -1; memcpy(to, from, len); return 0; } struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; elem = kzalloc(set->ops->elemsize + tmpl->len, gfp); if (elem == NULL) return ERR_PTR(-ENOMEM); ext = nft_set_elem_ext(set, elem); nft_set_ext_init(ext, tmpl); if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) && nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY, nft_set_ext_key(ext), key, set->klen) < 0) goto err_ext_check; if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END, nft_set_ext_key_end(ext), key_end, set->klen) < 0) goto err_ext_check; if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA, nft_set_ext_data(ext), data, set->dlen) < 0) goto err_ext_check; if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { nft_set_ext_timeout(ext)->timeout = timeout; if (expiration == 0) expiration = timeout; nft_set_ext_timeout(ext)->expiration = get_jiffies_64() + expiration; } return elem; err_ext_check: kfree(elem); return ERR_PTR(-EINVAL); } static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) { if (expr->ops->destroy_clone) { expr->ops->destroy_clone(ctx, expr); module_put(expr->ops->type->owner); } else { nf_tables_expr_destroy(ctx, expr); } } static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, struct nft_set_elem_expr *elem_expr) { struct nft_expr *expr; u32 size; nft_setelem_expr_foreach(expr, elem_expr, size) __nft_set_elem_expr_destroy(ctx, expr); } /* Drop references and destroy. Called from gc, dynset and abort path. */ static void __nft_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool destroy_expr) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_dec(&(*nft_set_ext_obj(ext))->use); kfree(elem_priv); } /* Drop references and destroy. Called from gc and dynset. */ void nft_set_elem_destroy(const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool destroy_expr) { struct nft_ctx ctx = { .net = read_pnet(&set->net), .family = set->table->family, }; __nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr); } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); /* Drop references and destroy. Called from abort path. */ static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te) { int i; for (i = 0; i < te->nelems; i++) { /* skip update request, see nft_trans_elems_new_abort() */ if (!te->elems[i].priv) continue; __nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true); } } /* Destroy element. References have been already dropped in the preparation * path via nft_setelem_data_deactivate(). */ void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); kfree(elem_priv); } static void nft_trans_elems_destroy(const struct nft_ctx *ctx, const struct nft_trans_elem *te) { int i; for (i = 0; i < te->nelems; i++) nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv); } int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]) { struct nft_expr *expr; int err, i, k; for (i = 0; i < set->num_exprs; i++) { expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT); if (!expr) goto err_expr; err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT); if (err < 0) { kfree(expr); goto err_expr; } expr_array[i] = expr; } return 0; err_expr: for (k = i - 1; k >= 0; k--) nft_expr_destroy(ctx, expr_array[k]); return -ENOMEM; } static int nft_set_elem_expr_setup(struct nft_ctx *ctx, const struct nft_set_ext_tmpl *tmpl, const struct nft_set_ext *ext, struct nft_expr *expr_array[], u32 num_exprs) { struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); u32 len = sizeof(struct nft_set_elem_expr); struct nft_expr *expr; int i, err; if (num_exprs == 0) return 0; for (i = 0; i < num_exprs; i++) len += expr_array[i]->ops->size; if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0) return -EINVAL; for (i = 0; i < num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT); if (err < 0) goto err_elem_expr_setup; elem_expr->size += expr_array[i]->ops->size; nft_expr_destroy(ctx, expr_array[i]); expr_array[i] = NULL; } return 0; err_elem_expr_setup: for (; i < num_exprs; i++) { nft_expr_destroy(ctx, expr_array[i]); expr_array[i] = NULL; } return -ENOMEM; } struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); struct nft_set_ext *ext; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (nft_set_elem_active(ext, genmask) && !nft_set_elem_expired(ext) && !nft_set_elem_is_dead(ext)) return ext; } return NULL; } EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); static int nft_setelem_catchall_insert(const struct net *net, struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **priv) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_next(net); struct nft_set_ext *ext; list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (nft_set_elem_active(ext, genmask)) { *priv = catchall->elem; return -EEXIST; } } catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT); if (!catchall) return -ENOMEM; catchall->elem = elem->priv; list_add_tail_rcu(&catchall->list, &set->catchall_list); return 0; } static int nft_setelem_insert(const struct net *net, struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **elem_priv, unsigned int flags) { int ret; if (flags & NFT_SET_ELEM_CATCHALL) ret = nft_setelem_catchall_insert(net, set, elem, elem_priv); else ret = set->ops->insert(net, set, elem, elem_priv); return ret; } static bool nft_setelem_is_catchall(const struct nft_set *set, const struct nft_elem_priv *elem_priv) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL) return true; return false; } static void nft_setelem_activate(struct net *net, struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_setelem_is_catchall(set, elem_priv)) { nft_clear(net, ext); } else { set->ops->activate(net, set, elem_priv); } } static void nft_trans_elem_update(const struct nft_set *set, const struct nft_trans_one_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); const struct nft_elem_update *update = elem->update; if (update->flags & NFT_TRANS_UPD_TIMEOUT) WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout); if (update->flags & NFT_TRANS_UPD_EXPIRATION) WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration); } static void nft_trans_elems_add(const struct nft_ctx *ctx, struct nft_trans_elem *te) { int i; for (i = 0; i < te->nelems; i++) { struct nft_trans_one_elem *elem = &te->elems[i]; if (elem->update) nft_trans_elem_update(te->set, elem); else nft_setelem_activate(ctx->net, te->set, elem->priv); nf_tables_setelem_notify(ctx, te->set, elem->priv, NFT_MSG_NEWSETELEM); kfree(elem->update); } } static int nft_setelem_catchall_deactivate(const struct net *net, struct nft_set *set, struct nft_set_elem *elem) { struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_is_active_next(net, ext)) continue; kfree(elem->priv); elem->priv = catchall->elem; nft_set_elem_change_active(net, set, ext); return 0; } return -ENOENT; } static int __nft_setelem_deactivate(const struct net *net, struct nft_set *set, struct nft_set_elem *elem) { void *priv; priv = set->ops->deactivate(net, set, elem); if (!priv) return -ENOENT; kfree(elem->priv); elem->priv = priv; set->ndeact++; return 0; } static int nft_setelem_deactivate(const struct net *net, struct nft_set *set, struct nft_set_elem *elem, u32 flags) { int ret; if (flags & NFT_SET_ELEM_CATCHALL) ret = nft_setelem_catchall_deactivate(net, set, elem); else ret = __nft_setelem_deactivate(net, set, elem); return ret; } static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall) { list_del_rcu(&catchall->list); kfree_rcu(catchall, rcu); } static void nft_setelem_catchall_remove(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_set_elem_catchall *catchall, *next; list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { if (catchall->elem == elem_priv) { nft_setelem_catchall_destroy(catchall); break; } } } static void nft_setelem_remove(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { if (nft_setelem_is_catchall(set, elem_priv)) nft_setelem_catchall_remove(net, set, elem_priv); else set->ops->remove(net, set, elem_priv); } static void nft_trans_elems_remove(const struct nft_ctx *ctx, const struct nft_trans_elem *te) { int i; for (i = 0; i < te->nelems; i++) { WARN_ON_ONCE(te->elems[i].update); nf_tables_setelem_notify(ctx, te->set, te->elems[i].priv, te->nft_trans.msg_type); nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) { atomic_dec(&te->set->nelems); te->set->ndeact--; } } } static bool nft_setelem_valid_key_end(const struct nft_set *set, struct nlattr **nla, u32 flags) { if ((set->flags & (NFT_SET_CONCAT | NFT_SET_INTERVAL)) == (NFT_SET_CONCAT | NFT_SET_INTERVAL)) { if (flags & NFT_SET_ELEM_INTERVAL_END) return false; if (nla[NFTA_SET_ELEM_KEY_END] && flags & NFT_SET_ELEM_CATCHALL) return false; } else { if (nla[NFTA_SET_ELEM_KEY_END]) return false; } return true; } static u32 nft_set_maxsize(const struct nft_set *set) { u32 maxsize, delta; if (!set->size) return UINT_MAX; if (set->ops->adjust_maxsize) delta = set->ops->adjust_maxsize(set); else delta = 0; if (check_add_overflow(set->size, set->ndeact, &maxsize)) return UINT_MAX; if (check_add_overflow(maxsize, delta, &maxsize)) return UINT_MAX; return maxsize; } static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {}; struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); u32 flags = 0, size = 0, num_exprs = 0; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_elem_priv *elem_priv; struct nft_object *obj = NULL; struct nft_userdata *udata; struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; u64 expiration; u64 timeout; int err, i; u8 ulen; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) return err; nft_set_ext_prepare(&tmpl); err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) || (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY])) return -EINVAL; if (flags != 0) { err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); if (err < 0) return err; } if (set->flags & NFT_SET_MAP) { if (nla[NFTA_SET_ELEM_DATA] == NULL && !(flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; } else { if (nla[NFTA_SET_ELEM_DATA] != NULL) return -EINVAL; } if (set->flags & NFT_SET_OBJECT) { if (!nla[NFTA_SET_ELEM_OBJREF] && !(flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; } else { if (nla[NFTA_SET_ELEM_OBJREF]) return -EINVAL; } if (!nft_setelem_valid_key_end(set, nla, flags)) return -EINVAL; if ((flags & NFT_SET_ELEM_INTERVAL_END) && (nla[NFTA_SET_ELEM_DATA] || nla[NFTA_SET_ELEM_OBJREF] || nla[NFTA_SET_ELEM_TIMEOUT] || nla[NFTA_SET_ELEM_EXPIRATION] || nla[NFTA_SET_ELEM_USERDATA] || nla[NFTA_SET_ELEM_EXPR] || nla[NFTA_SET_ELEM_KEY_END] || nla[NFTA_SET_ELEM_EXPRESSIONS])) return -EINVAL; timeout = 0; if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT], &timeout); if (err) return err; } else if (set->flags & NFT_SET_TIMEOUT && !(flags & NFT_SET_ELEM_INTERVAL_END)) { timeout = set->timeout; } expiration = 0; if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; if (timeout == 0) return -EOPNOTSUPP; err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION], &expiration); if (err) return err; if (expiration > timeout) return -ERANGE; } if (nla[NFTA_SET_ELEM_EXPR]) { struct nft_expr *expr; if (set->num_exprs && set->num_exprs != 1) return -EOPNOTSUPP; expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_ELEM_EXPR]); if (IS_ERR(expr)) return PTR_ERR(expr); expr_array[0] = expr; num_exprs = 1; if (set->num_exprs && set->exprs[0]->ops != expr->ops) { err = -EOPNOTSUPP; goto err_set_elem_expr; } } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) { struct nft_expr *expr; struct nlattr *tmp; int left; i = 0; nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX || (set->num_exprs && set->num_exprs == i)) { err = -E2BIG; goto err_set_elem_expr; } if (nla_type(tmp) != NFTA_LIST_ELEM) { err = -EINVAL; goto err_set_elem_expr; } expr = nft_set_elem_expr_alloc(ctx, set, tmp); if (IS_ERR(expr)) { err = PTR_ERR(expr); goto err_set_elem_expr; } expr_array[i] = expr; num_exprs++; if (set->num_exprs && expr->ops != set->exprs[i]->ops) { err = -EOPNOTSUPP; goto err_set_elem_expr; } i++; } if (set->num_exprs && set->num_exprs != i) { err = -EOPNOTSUPP; goto err_set_elem_expr; } } else if (set->num_exprs > 0 && !(flags & NFT_SET_ELEM_INTERVAL_END)) { err = nft_set_elem_expr_clone(ctx, set, expr_array); if (err < 0) goto err_set_elem_expr_clone; num_exprs = set->num_exprs; } if (nla[NFTA_SET_ELEM_KEY]) { err = nft_setelem_parse_key(ctx, set, &elem.key.val, nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err_set_elem_expr; err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); if (err < 0) goto err_parse_key; } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, nla[NFTA_SET_ELEM_KEY_END]); if (err < 0) goto err_parse_key; err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); if (err < 0) goto err_parse_key_end; } if (set->flags & NFT_SET_TIMEOUT) { err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); if (err < 0) goto err_parse_key_end; } if (num_exprs) { for (i = 0; i < num_exprs; i++) size += expr_array[i]->ops->size; err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS, sizeof(struct nft_set_elem_expr) + size); if (err < 0) goto err_parse_key_end; } if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { obj = nft_obj_lookup(ctx->net, ctx->table, nla[NFTA_SET_ELEM_OBJREF], set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); obj = NULL; goto err_parse_key_end; } if (!nft_use_inc(&obj->use)) { err = -EMFILE; obj = NULL; goto err_parse_key_end; } err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); if (err < 0) goto err_parse_key_end; } if (nla[NFTA_SET_ELEM_DATA] != NULL) { err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val, nla[NFTA_SET_ELEM_DATA]); if (err < 0) goto err_parse_key_end; dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { .net = ctx->net, .family = ctx->family, .table = ctx->table, .chain = (struct nft_chain *)binding->chain, }; if (!(binding->flags & NFT_SET_MAP)) continue; err = nft_validate_register_store(&bind_ctx, dreg, &elem.data.val, desc.type, desc.len); if (err < 0) goto err_parse_data; if (desc.type == NFT_DATA_VERDICT && (elem.data.val.verdict.code == NFT_GOTO || elem.data.val.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->table, NFT_VALIDATE_NEED); } err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); if (err < 0) goto err_parse_data; } /* The full maximum length of userdata can exceed the maximum * offset value (U8_MAX) for following extensions, therefor it * must be the last extension added. */ ulen = 0; if (nla[NFTA_SET_ELEM_USERDATA] != NULL) { ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]); if (ulen > 0) { err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA, ulen); if (err < 0) goto err_parse_data; } } elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, elem.key_end.val.data, elem.data.val.data, timeout, expiration, GFP_KERNEL_ACCOUNT); if (IS_ERR(elem.priv)) { err = PTR_ERR(elem.priv); goto err_parse_data; } ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; if (obj) *nft_set_ext_obj(ext) = obj; if (ulen > 0) { if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { err = -EINVAL; goto err_elem_free; } udata = nft_set_ext_userdata(ext); udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs); if (err < 0) goto err_elem_free; trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) { err = -ENOMEM; goto err_elem_free; } ext->genmask = nft_genmask_cur(ctx->net); err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags); if (err) { if (err == -EEXIST) { ext2 = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) goto err_element_clash; if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), nft_set_ext_data(ext2), set->dlen) != 0) || (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) && *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2))) goto err_element_clash; else if (!(nlmsg_flags & NLM_F_EXCL)) { err = 0; if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) { struct nft_elem_update update = { }; if (timeout != nft_set_ext_timeout(ext2)->timeout) { update.timeout = timeout; if (expiration == 0) expiration = timeout; update.flags |= NFT_TRANS_UPD_TIMEOUT; } if (expiration) { update.expiration = expiration; update.flags |= NFT_TRANS_UPD_EXPIRATION; } if (update.flags) { struct nft_trans_one_elem *ue; ue = &nft_trans_container_elem(trans)->elems[0]; ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL); if (!ue->update) { err = -ENOMEM; goto err_element_clash; } ue->priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); goto err_elem_free; } } } } else if (err == -ENOTEMPTY) { /* ENOTEMPTY reports overlapping between this element * and an existing one. */ err = -EEXIST; } goto err_element_clash; } if (!(flags & NFT_SET_ELEM_CATCHALL)) { unsigned int max = nft_set_maxsize(set); if (!atomic_add_unless(&set->nelems, 1, max)) { err = -ENFILE; goto err_set_full; } } nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; err_set_full: nft_setelem_remove(ctx->net, set, elem.priv); err_element_clash: kfree(trans); err_elem_free: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); err_parse_key_end: if (obj) nft_use_dec_restore(&obj->use); nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); err_set_elem_expr: for (i = 0; i < num_exprs && expr_array[i]; i++) nft_expr_destroy(ctx, expr_array[i]); err_set_elem_expr_clone: return err; } static int nf_tables_newsetelem(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err; if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); } set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET], nla[NFTA_SET_ELEM_LIST_SET_ID], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); } if (!list_empty(&set->bindings) && (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); return err; } } if (table->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); return 0; } /** * nft_data_hold - hold a nft_data item * * @data: struct nft_data to release * @type: type of data * * Hold a nft_data item. NFT_DATA_VALUE types can be silently discarded, * NFT_DATA_VERDICT bumps the reference to chains in case of NFT_JUMP and * NFT_GOTO verdicts. This function must be called on active data objects * from the second phase of the commit protocol. */ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { struct nft_chain *chain; if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; nft_use_inc_restore(&chain->use); break; } } } static int nft_setelem_active_next(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); u8 genmask = nft_genmask_next(net); return nft_set_elem_active(ext, genmask); } static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_hold(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_dec(&(*nft_set_ext_obj(ext))->use); } /* similar to nft_trans_elems_remove, but called from abort path to undo newsetelem. * No notifications and no ndeact changes. * * Returns true if set had been added to (i.e., elements need to be removed again). */ static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, struct nft_trans_elem *te) { bool removed = false; int i; for (i = 0; i < te->nelems; i++) { if (te->elems[i].update) { kfree(te->elems[i].update); te->elems[i].update = NULL; /* Update request, so do not release this element */ te->elems[i].priv = NULL; continue; } if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) atomic_dec(&te->set->nelems); removed = true; } return removed; } /* Called from abort path to undo DELSETELEM/DESTROYSETELEM. */ static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx, const struct nft_trans_elem *te) { int i; for (i = 0; i < te->nelems; i++) { if (!nft_setelem_active_next(ctx->net, te->set, te->elems[i].priv)) { nft_setelem_data_activate(ctx->net, te->set, te->elems[i].priv); nft_setelem_activate(ctx->net, te->set, te->elems[i].priv); } if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) te->set->ndeact--; } } static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_ext_tmpl tmpl; struct nft_set_elem elem; struct nft_set_ext *ext; struct nft_trans *trans; u32 flags = 0; int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) return err; err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) return -EINVAL; if (!nft_setelem_valid_key_end(set, nla, flags)) return -EINVAL; nft_set_ext_prepare(&tmpl); if (flags != 0) { err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); if (err < 0) return err; } if (nla[NFTA_SET_ELEM_KEY]) { err = nft_setelem_parse_key(ctx, set, &elem.key.val, nla[NFTA_SET_ELEM_KEY]); if (err < 0) return err; err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); if (err < 0) goto fail_elem; } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, nla[NFTA_SET_ELEM_KEY_END]); if (err < 0) goto fail_elem; err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); if (err < 0) goto fail_elem_key_end; } err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, elem.key_end.val.data, NULL, 0, 0, GFP_KERNEL_ACCOUNT); if (IS_ERR(elem.priv)) { err = PTR_ERR(elem.priv); goto fail_elem_key_end; } ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (trans == NULL) goto fail_trans; err = nft_setelem_deactivate(ctx->net, set, &elem, flags); if (err < 0) goto fail_ops; nft_setelem_data_deactivate(ctx->net, set, elem.priv); nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; fail_ops: kfree(trans); fail_trans: kfree(elem.priv); fail_elem_key_end: nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); fail_elem: nft_data_release(&elem.key.val, NFT_DATA_VALUE); return err; } static int nft_setelem_flush(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_trans *trans; if (!nft_set_elem_active(ext, iter->genmask)) return 0; trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, struct_size_t(struct nft_trans_elem, elems, 1), GFP_ATOMIC); if (!trans) return -ENOMEM; set->ops->flush(ctx->net, set, elem_priv); set->ndeact++; nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_elem_set(trans) = set; nft_trans_container_elem(trans)->nelems = 1; nft_trans_container_elem(trans)->elems[0].priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC); return 0; } static int __nft_set_catchall_flush(const struct nft_ctx *ctx, struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_trans *trans; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (!trans) return -ENOMEM; nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_container_elem(trans)->elems[0].priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; } static int nft_set_catchall_flush(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list, lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; ret = __nft_set_catchall_flush(ctx, set, catchall->elem); if (ret < 0) break; nft_set_elem_change_active(ctx->net, set, ext); } return ret; } static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { .genmask = genmask, .type = NFT_ITER_UPDATE, .fn = nft_setelem_flush, }; set->ops->walk(ctx, set, &iter); if (!iter.err) iter.err = nft_set_catchall_flush(ctx, set); return iter.err; } static int nf_tables_delsetelem(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err = 0; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); } set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); } if (nft_set_is_anonymous(set)) return -EOPNOTSUPP; if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT)) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return nft_set_flush(&ctx, set, genmask); nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); if (err == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM) continue; if (err < 0) { NL_SET_BAD_ATTR(extack, attr); return err; } } return 0; } /* * Stateful objects */ /** * nft_register_obj- register nf_tables stateful object type * @obj_type: object type * * Registers the object type for use with nf_tables. Returns zero on * success or a negative errno code otherwise. */ int nft_register_obj(struct nft_object_type *obj_type) { if (obj_type->type == NFT_OBJECT_UNSPEC) return -EINVAL; nfnl_lock(NFNL_SUBSYS_NFTABLES); list_add_rcu(&obj_type->list, &nf_tables_objects); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } EXPORT_SYMBOL_GPL(nft_register_obj); /** * nft_unregister_obj - unregister nf_tables object type * @obj_type: object type * * Unregisters the object type for use with nf_tables. */ void nft_unregister_obj(struct nft_object_type *obj_type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); list_del_rcu(&obj_type->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_obj); struct nft_object *nft_obj_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask) { struct nft_object_hash_key k = { .table = table }; char search[NFT_OBJ_MAXNAMELEN]; struct rhlist_head *tmp, *list; struct nft_object *obj; nla_strscpy(search, nla, sizeof(search)); k.name = search; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_commit_lock_is_held(net)); rcu_read_lock(); list = rhltable_lookup(&nft_objname_ht, &k, nft_objname_ht_params); if (!list) goto out; rhl_for_each_entry_rcu(obj, tmp, list, rhlhead) { if (objtype == obj->ops->type->type && nft_active_genmask(obj, genmask)) { rcu_read_unlock(); return obj; } } out: rcu_read_unlock(); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(nft_obj_lookup); static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask) { struct nft_object *obj; list_for_each_entry(obj, &table->objects, list) { if (be64_to_cpu(nla_get_be64(nla)) == obj->handle && objtype == obj->ops->type->type && nft_active_genmask(obj, genmask)) return obj; } return ERR_PTR(-ENOENT); } static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { [NFTA_OBJ_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_OBJ_NAME] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, [NFTA_OBJ_HANDLE] = { .type = NLA_U64}, [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, }; static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, const struct nft_object_type *type, const struct nlattr *attr) { struct nlattr **tb; const struct nft_object_ops *ops; struct nft_object *obj; int err = -ENOMEM; tb = kmalloc_array(type->maxattr + 1, sizeof(*tb), GFP_KERNEL); if (!tb) goto err1; if (attr) { err = nla_parse_nested_deprecated(tb, type->maxattr, attr, type->policy, NULL); if (err < 0) goto err2; } else { memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1)); } if (type->select_ops) { ops = type->select_ops(ctx, (const struct nlattr * const *)tb); if (IS_ERR(ops)) { err = PTR_ERR(ops); goto err2; } } else { ops = type->ops; } err = -ENOMEM; obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT); if (!obj) goto err2; err = ops->init(ctx, (const struct nlattr * const *)tb, obj); if (err < 0) goto err3; obj->ops = ops; kfree(tb); return obj; err3: kfree(obj); err2: kfree(tb); err1: return ERR_PTR(err); } static int nft_object_dump(struct sk_buff *skb, unsigned int attr, struct nft_object *obj, bool reset) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, attr); if (!nest) goto nla_put_failure; if (obj->ops->dump(skb, obj, reset) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: return -1; } static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family) { const struct nft_object_type *type; list_for_each_entry_rcu(type, &nf_tables_objects, list) { if (type->family != NFPROTO_UNSPEC && type->family != family) continue; if (objtype == type->type) return type; } return NULL; } static const struct nft_object_type * nft_obj_type_get(struct net *net, u32 objtype, u8 family) { const struct nft_object_type *type; rcu_read_lock(); type = __nft_obj_type_get(objtype, family); if (type != NULL && try_module_get(type->owner)) { rcu_read_unlock(); return type; } rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { if (nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } static int nf_tables_updobj(const struct nft_ctx *ctx, const struct nft_object_type *type, const struct nlattr *attr, struct nft_object *obj) { struct nft_object *newobj; struct nft_trans *trans; int err = -ENOMEM; /* caller must have obtained type->owner reference. */ trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ, sizeof(struct nft_trans_obj)); if (!trans) goto err_trans; newobj = nft_obj_init(ctx, type, attr); if (IS_ERR(newobj)) { err = PTR_ERR(newobj); goto err_free_trans; } nft_trans_obj(trans) = obj; nft_trans_obj_update(trans) = true; nft_trans_obj_newobj(trans) = newobj; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_free_trans: kfree(trans); err_trans: module_put(type->owner); return err; } static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_object_type *type; struct net *net = info->net; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; u32 objtype; int err; if (!nla[NFTA_OBJ_TYPE] || !nla[NFTA_OBJ_NAME] || !nla[NFTA_OBJ_DATA]) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); if (err != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return err; } } else { if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return -EEXIST; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (!obj->ops->update) return 0; type = nft_obj_type_get(net, objtype, family); if (WARN_ON_ONCE(IS_ERR(type))) return PTR_ERR(type); nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); /* type->owner reference is put when transaction object is released. */ return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (!nft_use_inc(&table->use)) return -EMFILE; type = nft_obj_type_get(net, objtype, family); if (IS_ERR(type)) { err = PTR_ERR(type); goto err_type; } obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]); if (IS_ERR(obj)) { err = PTR_ERR(obj); goto err_init; } obj->key.table = table; obj->handle = nf_tables_alloc_handle(table); obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT); if (!obj->key.name) { err = -ENOMEM; goto err_strdup; } if (nla[NFTA_OBJ_USERDATA]) { obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT); if (obj->udata == NULL) goto err_userdata; obj->udlen = nla_len(nla[NFTA_OBJ_USERDATA]); } err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); if (err < 0) goto err_trans; err = rhltable_insert(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params); if (err < 0) goto err_obj_ht; list_add_tail_rcu(&obj->list, &table->objects); return 0; err_obj_ht: /* queued in transaction log */ INIT_LIST_HEAD(&obj->list); return err; err_trans: kfree(obj->udata); err_userdata: kfree(obj->key.name); err_strdup: if (obj->ops->destroy) obj->ops->destroy(&ctx, obj); kfree(obj); err_init: module_put(type->owner); err_type: nft_use_dec_restore(&table->use); return err; } static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, struct nft_object *obj, bool reset) { struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) || nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle), NFTA_OBJ_PAD)) goto nla_put_failure; if (event == NFT_MSG_DELOBJ) { nlmsg_end(skb, nlh); return 0; } if (nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset)) goto nla_put_failure; if (obj->udata && nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } static void audit_log_obj_reset(const struct nft_table *table, unsigned int base_seq, unsigned int nentries) { char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq); audit_log_nfcfg(buf, table->family, nentries, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); } struct nft_obj_dump_ctx { unsigned int s_idx; char *table; u32 type; bool reset; }; static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; const struct nft_table *table; unsigned int entries = 0; struct nft_object *obj; unsigned int idx = 0; int rc = 0; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; entries = 0; list_for_each_entry_rcu(obj, &table->objects, list) { if (!nft_is_active(net, obj)) goto cont; if (idx < ctx->s_idx) goto cont; if (ctx->table && strcmp(ctx->table, table->name)) goto cont; if (ctx->type != NFT_OBJECT_UNSPEC && obj->ops->type->type != ctx->type) goto cont; rc = nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, NLM_F_MULTI | NLM_F_APPEND, table->family, table, obj, ctx->reset); if (rc < 0) break; entries++; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } if (ctx->reset && entries) audit_log_obj_reset(table, nft_net->base_seq, entries); if (rc < 0) break; } rcu_read_unlock(); ctx->s_idx = idx; return skb->len; } static int nf_tables_dumpreset_obj(struct sk_buff *skb, struct netlink_callback *cb) { struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); int ret; mutex_lock(&nft_net->commit_mutex); ret = nf_tables_dump_obj(skb, cb); mutex_unlock(&nft_net->commit_mutex); return ret; } static int nf_tables_dump_obj_start(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; const struct nlattr * const *nla = cb->data; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); if (nla[NFTA_OBJ_TABLE]) { ctx->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); if (!ctx->table) return -ENOMEM; } if (nla[NFTA_OBJ_TYPE]) ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); return 0; } static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; ctx->reset = true; return nf_tables_dump_obj_start(cb); } static int nf_tables_dump_obj_done(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; kfree(ctx->table); return 0; } /* Caller must hold rcu read lock or transaction mutex */ static struct sk_buff * nf_tables_getobj_single(u32 portid, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; struct net *net = info->net; struct nft_object *obj; struct sk_buff *skb2; u32 objtype; int err; if (!nla[NFTA_OBJ_NAME] || !nla[NFTA_OBJ_TYPE]) return ERR_PTR(-EINVAL); table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return ERR_CAST(table); } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return ERR_CAST(obj); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return ERR_PTR(-ENOMEM); err = nf_tables_fill_obj_info(skb2, net, portid, info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); if (err < 0) { kfree_skb(skb2); return ERR_PTR(err); } return skb2; } static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { u32 portid = NETLINK_CB(skb).portid; struct sk_buff *skb2; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_obj_start, .dump = nf_tables_dump_obj, .done = nf_tables_dump_obj_done, .module = THIS_MODULE, .data = (void *)nla, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } skb2 = nf_tables_getobj_single(portid, info, nla, false); if (IS_ERR(skb2)) return PTR_ERR(skb2); return nfnetlink_unicast(skb2, info->net, portid); } static int nf_tables_getobj_reset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); u32 portid = NETLINK_CB(skb).portid; struct net *net = info->net; struct sk_buff *skb2; char *buf; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dumpreset_obj_start, .dump = nf_tables_dumpreset_obj, .done = nf_tables_dump_obj_done, .module = THIS_MODULE, .data = (void *)nla, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!try_module_get(THIS_MODULE)) return -EINVAL; rcu_read_unlock(); mutex_lock(&nft_net->commit_mutex); skb2 = nf_tables_getobj_single(portid, info, nla, true); mutex_unlock(&nft_net->commit_mutex); rcu_read_lock(); module_put(THIS_MODULE); if (IS_ERR(skb2)) return PTR_ERR(skb2); buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), nft_net->base_seq); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); return nfnetlink_unicast(skb2, net, portid); } static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { if (obj->ops->destroy) obj->ops->destroy(ctx, obj); module_put(obj->ops->type->owner); kfree(obj->key.name); kfree(obj->udata); kfree(obj); } static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; u32 objtype; if (!nla[NFTA_OBJ_TYPE] || (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); if (nla[NFTA_OBJ_HANDLE]) { attr = nla[NFTA_OBJ_HANDLE]; obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask); } else { attr = nla[NFTA_OBJ_NAME]; obj = nft_obj_lookup(net, table, attr, objtype, genmask); } if (IS_ERR(obj)) { if (PTR_ERR(obj) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ) return 0; NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(obj); } if (obj->use > 0) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nft_delobj(&ctx, obj); } static void __nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; int err; if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, gfp); if (skb == NULL) goto err; err = nf_tables_fill_obj_info(skb, net, portid, seq, event, flags & (NLM_F_CREATE | NLM_F_EXCL), family, table, obj, false); if (err < 0) { kfree_skb(skb); goto err; } nft_notify_enqueue(skb, report, &nft_net->notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); char *buf = kasprintf(gfp, "%s:%u", table->name, nft_net->base_seq); audit_log_nfcfg(buf, family, obj->handle, event == NFT_MSG_NEWOBJ ? AUDIT_NFT_OP_OBJ_REGISTER : AUDIT_NFT_OP_OBJ_UNREGISTER, gfp); kfree(buf); __nft_obj_notify(net, table, obj, portid, seq, event, flags, family, report, gfp); } EXPORT_SYMBOL_GPL(nft_obj_notify); static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { __nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, ctx->flags, ctx->family, ctx->report, GFP_KERNEL); } /* * Flow tables */ void nft_register_flowtable_type(struct nf_flowtable_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); list_add_tail_rcu(&type->list, &nf_tables_flowtables); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_register_flowtable_type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); list_del_rcu(&type->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_flowtable_type); static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_TABLE] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, [NFTA_FLOWTABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, [NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED }, [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; struct nft_flowtable *nft_flowtable_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; list_for_each_entry_rcu(flowtable, &table->flowtables, list, lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, flowtable->name) && nft_active_genmask(flowtable, genmask)) return flowtable; } return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(nft_flowtable_lookup); void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, struct nft_flowtable *flowtable, enum nft_trans_phase phase) { switch (phase) { case NFT_TRANS_PREPARE_ERROR: case NFT_TRANS_PREPARE: case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: nft_use_dec(&flowtable->use); fallthrough; default: return; } } EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable); static struct nft_flowtable * nft_flowtable_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; list_for_each_entry(flowtable, &table->flowtables, list) { if (be64_to_cpu(nla_get_be64(nla)) == flowtable->handle && nft_active_genmask(flowtable, genmask)) return flowtable; } return ERR_PTR(-ENOENT); } struct nft_flowtable_hook { u32 num; int priority; struct list_head list; }; static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_DEVS] = { .type = NLA_NESTED }, }; static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, const struct nlattr * const nla[], struct nft_flowtable_hook *flowtable_hook, struct nft_flowtable *flowtable, struct netlink_ext_ack *extack, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; struct nft_hook *hook; int hooknum, priority; int err; INIT_LIST_HEAD(&flowtable_hook->list); err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, nla[NFTA_FLOWTABLE_HOOK], nft_flowtable_hook_policy, NULL); if (err < 0) return err; if (add) { if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return -ENOENT; } hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); if (hooknum != NF_NETDEV_INGRESS) return -EOPNOTSUPP; priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); flowtable_hook->priority = priority; flowtable_hook->num = hooknum; } else { if (tb[NFTA_FLOWTABLE_HOOK_NUM]) { hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); if (hooknum != flowtable->hooknum) return -EOPNOTSUPP; } if (tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) { priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); if (priority != flowtable->data.priority) return -EOPNOTSUPP; } flowtable_hook->priority = flowtable->data.priority; flowtable_hook->num = flowtable->hooknum; } if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) { err = nf_tables_parse_netdev_hooks(ctx->net, tb[NFTA_FLOWTABLE_HOOK_DEVS], &flowtable_hook->list, extack); if (err < 0) return err; } list_for_each_entry(hook, &flowtable_hook->list, list) { hook->ops.pf = NFPROTO_NETDEV; hook->ops.hooknum = flowtable_hook->num; hook->ops.priority = flowtable_hook->priority; hook->ops.priv = &flowtable->data; hook->ops.hook = flowtable->data.type->hook; } return err; } /* call under rcu_read_lock */ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) { const struct nf_flowtable_type *type; list_for_each_entry_rcu(type, &nf_tables_flowtables, list) { if (family == type->family) return type; } return NULL; } static const struct nf_flowtable_type * nft_flowtable_type_get(struct net *net, u8 family) { const struct nf_flowtable_type *type; rcu_read_lock(); type = __nft_flowtable_type_get(family); if (type != NULL && try_module_get(type->owner)) { rcu_read_unlock(); return type; } rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { if (nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } /* Only called from error and netdev event paths. */ static void nft_unregister_flowtable_hook(struct net *net, struct nft_flowtable *flowtable, struct nft_hook *hook) { nf_unregister_net_hook(net, &hook->ops); flowtable->data.type->setup(&flowtable->data, hook->ops.dev, FLOW_BLOCK_UNBIND); } static void __nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable, struct list_head *hook_list, bool release_netdev) { struct nft_hook *hook, *next; list_for_each_entry_safe(hook, next, hook_list, list) { nf_unregister_net_hook(net, &hook->ops); flowtable->data.type->setup(&flowtable->data, hook->ops.dev, FLOW_BLOCK_UNBIND); if (release_netdev) { list_del(&hook->list); kfree_rcu(hook, rcu); } } } static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable, struct list_head *hook_list) { __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); } static int nft_register_flowtable_net_hooks(struct net *net, struct nft_table *table, struct list_head *hook_list, struct nft_flowtable *flowtable) { struct nft_hook *hook, *next; struct nft_flowtable *ft; int err, i = 0; list_for_each_entry(hook, hook_list, list) { list_for_each_entry(ft, &table->flowtables, list) { if (!nft_is_active_next(net, ft)) continue; if (nft_hook_list_find(&ft->hook_list, hook)) { err = -EEXIST; goto err_unregister_net_hooks; } } err = flowtable->data.type->setup(&flowtable->data, hook->ops.dev, FLOW_BLOCK_BIND); if (err < 0) goto err_unregister_net_hooks; err = nf_register_net_hook(net, &hook->ops); if (err < 0) { flowtable->data.type->setup(&flowtable->data, hook->ops.dev, FLOW_BLOCK_UNBIND); goto err_unregister_net_hooks; } i++; } return 0; err_unregister_net_hooks: list_for_each_entry_safe(hook, next, hook_list, list) { if (i-- <= 0) break; nft_unregister_flowtable_hook(net, flowtable, hook); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } return err; } static void nft_hooks_destroy(struct list_head *hook_list) { struct nft_hook *hook, *next; list_for_each_entry_safe(hook, next, hook_list, list) { list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } } static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, struct nft_flowtable *flowtable, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; struct nft_hook *hook, *next; struct nft_trans *trans; bool unregister = false; u32 flags; int err; err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable, extack, false); if (err < 0) return err; list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { if (nft_hook_list_find(&flowtable->hook_list, hook)) { list_del(&hook->list); kfree(hook); } } if (nla[NFTA_FLOWTABLE_FLAGS]) { flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); if (flags & ~NFT_FLOWTABLE_MASK) { err = -EOPNOTSUPP; goto err_flowtable_update_hook; } if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^ (flags & NFT_FLOWTABLE_HW_OFFLOAD)) { err = -EOPNOTSUPP; goto err_flowtable_update_hook; } } else { flags = flowtable->data.flags; } err = nft_register_flowtable_net_hooks(ctx->net, ctx->table, &flowtable_hook.list, flowtable); if (err < 0) goto err_flowtable_update_hook; trans = nft_trans_alloc(ctx, NFT_MSG_NEWFLOWTABLE, sizeof(struct nft_trans_flowtable)); if (!trans) { unregister = true; err = -ENOMEM; goto err_flowtable_update_hook; } nft_trans_flowtable_flags(trans) = flags; nft_trans_flowtable(trans) = flowtable; nft_trans_flowtable_update(trans) = true; INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans)); nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_flowtable_update_hook: list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { if (unregister) nft_unregister_flowtable_hook(ctx->net, flowtable, hook); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } return err; } static int nf_tables_newflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; struct nft_flowtable_hook flowtable_hook; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; const struct nf_flowtable_type *type; struct nft_flowtable *flowtable; struct net *net = info->net; struct nft_table *table; struct nft_trans *trans; struct nft_ctx ctx; int err; if (!nla[NFTA_FLOWTABLE_TABLE] || !nla[NFTA_FLOWTABLE_NAME] || !nla[NFTA_FLOWTABLE_HOOK]) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); } flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); if (err != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return err; } } else { if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return -EEXIST; } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nft_flowtable_update(&ctx, info->nlh, flowtable, extack); } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (!nft_use_inc(&table->use)) return -EMFILE; flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT); if (!flowtable) { err = -ENOMEM; goto flowtable_alloc; } flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); INIT_LIST_HEAD(&flowtable->hook_list); flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT); if (!flowtable->name) { err = -ENOMEM; goto err1; } type = nft_flowtable_type_get(net, family); if (IS_ERR(type)) { err = PTR_ERR(type); goto err2; } if (nla[NFTA_FLOWTABLE_FLAGS]) { flowtable->data.flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) { err = -EOPNOTSUPP; goto err3; } } write_pnet(&flowtable->data.net, net); flowtable->data.type = type; err = type->init(&flowtable->data); if (err < 0) goto err3; err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable, extack, true); if (err < 0) goto err_flowtable_parse_hooks; list_splice(&flowtable_hook.list, &flowtable->hook_list); flowtable->data.priority = flowtable_hook.priority; flowtable->hooknum = flowtable_hook.num; trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto err_flowtable_trans; } /* This must be LAST to ensure no packets are walking over this flowtable. */ err = nft_register_flowtable_net_hooks(ctx.net, table, &flowtable->hook_list, flowtable); if (err < 0) goto err_flowtable_hooks; list_add_tail_rcu(&flowtable->list, &table->flowtables); return 0; err_flowtable_hooks: nft_trans_destroy(trans); err_flowtable_trans: nft_hooks_destroy(&flowtable->hook_list); err_flowtable_parse_hooks: flowtable->data.type->free(&flowtable->data); err3: module_put(type->owner); err2: kfree(flowtable->name); err1: kfree(flowtable); flowtable_alloc: nft_use_dec_restore(&table->use); return err; } static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook) { struct nft_hook *this, *next; list_for_each_entry_safe(this, next, &flowtable_hook->list, list) { list_del(&this->list); kfree(this); } } static int nft_delflowtable_hook(struct nft_ctx *ctx, struct nft_flowtable *flowtable, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; LIST_HEAD(flowtable_del_list); struct nft_hook *this, *hook; struct nft_trans *trans; int err; err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable, extack, false); if (err < 0) return err; list_for_each_entry(this, &flowtable_hook.list, list) { hook = nft_hook_list_find(&flowtable->hook_list, this); if (!hook) { err = -ENOENT; goto err_flowtable_del_hook; } list_move(&hook->list, &flowtable_del_list); } trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, sizeof(struct nft_trans_flowtable)); if (!trans) { err = -ENOMEM; goto err_flowtable_del_hook; } nft_trans_flowtable(trans) = flowtable; nft_trans_flowtable_update(trans) = true; INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); list_splice(&flowtable_del_list, &nft_trans_flowtable_hooks(trans)); nft_flowtable_hook_release(&flowtable_hook); nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_flowtable_del_hook: list_splice(&flowtable_del_list, &flowtable->hook_list); nft_flowtable_hook_release(&flowtable_hook); return err; } static int nf_tables_delflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; if (!nla[NFTA_FLOWTABLE_TABLE] || (!nla[NFTA_FLOWTABLE_NAME] && !nla[NFTA_FLOWTABLE_HANDLE])) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); } if (nla[NFTA_FLOWTABLE_HANDLE]) { attr = nla[NFTA_FLOWTABLE_HANDLE]; flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_FLOWTABLE_NAME]; flowtable = nft_flowtable_lookup(net, table, attr, genmask); } if (IS_ERR(flowtable)) { if (PTR_ERR(flowtable) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE) return 0; NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (nla[NFTA_FLOWTABLE_HOOK]) return nft_delflowtable_hook(&ctx, flowtable, extack); if (flowtable->use > 0) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } return nft_delflowtable(&ctx, flowtable); } static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, struct nft_flowtable *flowtable, struct list_head *hook_list) { struct nlattr *nest, *nest_devs; struct nft_hook *hook; struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) || nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle), NFTA_FLOWTABLE_PAD)) goto nla_put_failure; if (event == NFT_MSG_DELFLOWTABLE && !hook_list) { nlmsg_end(skb, nlh); return 0; } if (nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags))) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK); if (!nest) goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority))) goto nla_put_failure; nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS); if (!nest_devs) goto nla_put_failure; if (!hook_list) hook_list = &flowtable->hook_list; list_for_each_entry_rcu(hook, hook_list, list, lockdep_commit_lock_is_held(net)) { if (nla_put(skb, NFTA_DEVICE_NAME, hook->ifnamelen, hook->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } struct nft_flowtable_filter { char *table; }; static int nf_tables_dump_flowtable(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nft_flowtable_filter *filter = cb->data; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct nftables_pernet *nft_net; const struct nft_table *table; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; list_for_each_entry_rcu(flowtable, &table->flowtables, list) { if (!nft_is_active(net, flowtable)) goto cont; if (idx < s_idx) goto cont; if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (filter && filter->table && strcmp(filter->table, table->name)) goto cont; if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, NLM_F_MULTI | NLM_F_APPEND, table->family, flowtable, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } done: rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static int nf_tables_dump_flowtable_start(struct netlink_callback *cb) { const struct nlattr * const *nla = cb->data; struct nft_flowtable_filter *filter = NULL; if (nla[NFTA_FLOWTABLE_TABLE]) { filter = kzalloc(sizeof(*filter), GFP_ATOMIC); if (!filter) return -ENOMEM; filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], GFP_ATOMIC); if (!filter->table) { kfree(filter); return -ENOMEM; } } cb->data = filter; return 0; } static int nf_tables_dump_flowtable_done(struct netlink_callback *cb) { struct nft_flowtable_filter *filter = cb->data; if (!filter) return 0; kfree(filter->table); kfree(filter); return 0; } /* called with rcu_read_lock held */ static int nf_tables_getflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; const struct nft_table *table; struct net *net = info->net; struct sk_buff *skb2; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_flowtable_start, .dump = nf_tables_dump_flowtable, .done = nf_tables_dump_flowtable_done, .module = THIS_MODULE, .data = (void *)nla, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_FLOWTABLE_NAME]) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); } flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return PTR_ERR(flowtable); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, flowtable, NULL); if (err < 0) goto err_fill_flowtable_info; return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_flowtable_info: kfree_skb(skb2); return err; } static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct nft_flowtable *flowtable, struct list_head *hook_list, int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, flowtable, hook_list); if (err < 0) { kfree_skb(skb); goto err; } nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { struct nft_hook *hook, *next; flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } kfree(flowtable->name); module_put(flowtable->data.type->owner); kfree(flowtable); } static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -EMSGSIZE; } static void nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable) { struct nft_hook *hook; list_for_each_entry(hook, &flowtable->hook_list, list) { if (hook->ops.dev != dev) continue; /* flow_offload_netdev_event() cleans up entries for us. */ nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); break; } } static int nf_tables_flowtable_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_flowtable *flowtable; struct nftables_pernet *nft_net; struct nft_table *table; struct net *net; if (event != NETDEV_UNREGISTER) return 0; net = dev_net(dev); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { nft_flowtable_event(event, dev, flowtable); } } mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } static struct notifier_block nf_tables_flowtable_notifier = { .notifier_call = nf_tables_flowtable_event, }; static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) { struct nlmsghdr *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; int err; if (!nlmsg_report(nlh) && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) goto err; err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq); if (err < 0) { kfree_skb(skb2); goto err; } nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); return; err: nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct sk_buff *skb2; int err; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) return -ENOMEM; err = nf_tables_fill_gen_info(skb2, info->net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq); if (err < 0) goto err_fill_gen_info; return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); err_fill_gen_info: kfree_skb(skb2); return err; } static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { .call = nf_tables_newtable, .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_GETTABLE] = { .call = nf_tables_gettable, .type = NFNL_CB_RCU, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_DELTABLE] = { .call = nf_tables_deltable, .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_DESTROYTABLE] = { .call = nf_tables_deltable, .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_NEWCHAIN] = { .call = nf_tables_newchain, .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_GETCHAIN] = { .call = nf_tables_getchain, .type = NFNL_CB_RCU, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_DELCHAIN] = { .call = nf_tables_delchain, .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_DESTROYCHAIN] = { .call = nf_tables_delchain, .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_NEWRULE] = { .call = nf_tables_newrule, .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_GETRULE] = { .call = nf_tables_getrule, .type = NFNL_CB_RCU, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_GETRULE_RESET] = { .call = nf_tables_getrule_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_DELRULE] = { .call = nf_tables_delrule, .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_DESTROYRULE] = { .call = nf_tables_delrule, .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_NEWSET] = { .call = nf_tables_newset, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_GETSET] = { .call = nf_tables_getset, .type = NFNL_CB_RCU, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_DELSET] = { .call = nf_tables_delset, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_DESTROYSET] = { .call = nf_tables_delset, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_NEWSETELEM] = { .call = nf_tables_newsetelem, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM] = { .call = nf_tables_getsetelem, .type = NFNL_CB_RCU, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM_RESET] = { .call = nf_tables_getsetelem_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_DELSETELEM] = { .call = nf_tables_delsetelem, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_DESTROYSETELEM] = { .call = nf_tables_delsetelem, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETGEN] = { .call = nf_tables_getgen, .type = NFNL_CB_RCU, }, [NFT_MSG_NEWOBJ] = { .call = nf_tables_newobj, .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ] = { .call = nf_tables_getobj, .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_DELOBJ] = { .call = nf_tables_delobj, .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_DESTROYOBJ] = { .call = nf_tables_delobj, .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { .call = nf_tables_getobj_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_NEWFLOWTABLE] = { .call = nf_tables_newflowtable, .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_GETFLOWTABLE] = { .call = nf_tables_getflowtable, .type = NFNL_CB_RCU, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_DELFLOWTABLE] = { .call = nf_tables_delflowtable, .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_DESTROYFLOWTABLE] = { .call = nf_tables_delflowtable, .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, }; static int nf_tables_validate(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table; list_for_each_entry(table, &nft_net->tables, list) { switch (table->validate_state) { case NFT_VALIDATE_SKIP: continue; case NFT_VALIDATE_NEED: nft_validate_state_update(table, NFT_VALIDATE_DO); fallthrough; case NFT_VALIDATE_DO: if (nft_table_validate(net, table) < 0) return -EAGAIN; nft_validate_state_update(table, NFT_VALIDATE_SKIP); break; } } return 0; } /* a drop policy has to be deferred until all rules have been activated, * otherwise a large ruleset that contains a drop-policy base chain will * cause all packets to get dropped until the full transaction has been * processed. * * We defer the drop policy until the transaction has been finalized. */ static void nft_chain_commit_drop_policy(struct nft_trans_chain *trans) { struct nft_base_chain *basechain; if (trans->policy != NF_DROP) return; if (!nft_is_base_chain(trans->chain)) return; basechain = nft_base_chain(trans->chain); basechain->policy = NF_DROP; } static void nft_chain_commit_update(struct nft_trans_chain *trans) { struct nft_table *table = trans->nft_trans_binding.nft_trans.table; struct nft_base_chain *basechain; if (trans->name) { rhltable_remove(&table->chains_ht, &trans->chain->rhlhead, nft_chain_ht_params); swap(trans->chain->name, trans->name); rhltable_insert_key(&table->chains_ht, trans->chain->name, &trans->chain->rhlhead, nft_chain_ht_params); } if (!nft_is_base_chain(trans->chain)) return; nft_chain_stats_replace(trans); basechain = nft_base_chain(trans->chain); switch (trans->policy) { case NF_DROP: case NF_ACCEPT: basechain->policy = trans->policy; break; } } static void nft_obj_commit_update(const struct nft_ctx *ctx, struct nft_trans *trans) { struct nft_object *newobj; struct nft_object *obj; obj = nft_trans_obj(trans); newobj = nft_trans_obj_newobj(trans); if (WARN_ON_ONCE(!obj->ops->update)) return; obj->ops->update(obj, newobj); nft_obj_destroy(ctx, newobj); } static void nft_commit_release(struct nft_trans *trans) { struct nft_ctx ctx = { .net = trans->net, }; nft_ctx_update(&ctx, trans); switch (trans->msg_type) { case NFT_MSG_DELTABLE: case NFT_MSG_DESTROYTABLE: nf_tables_table_destroy(trans->table); break; case NFT_MSG_NEWCHAIN: free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) nft_hooks_destroy(&nft_trans_chain_hooks(trans)); else nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: nf_tables_rule_destroy(&ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: nft_set_destroy(&ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: case NFT_MSG_DESTROYSETELEM: nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: nft_obj_destroy(&ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) nft_hooks_destroy(&nft_trans_flowtable_hooks(trans)); else nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } if (trans->put_net) put_net(trans->net); kfree(trans); } static void nf_tables_trans_destroy_work(struct work_struct *w) { struct nft_trans *trans, *next; LIST_HEAD(head); spin_lock(&nf_tables_destroy_list_lock); list_splice_init(&nf_tables_destroy_list, &head); spin_unlock(&nf_tables_destroy_list_lock); if (list_empty(&head)) return; synchronize_rcu(); list_for_each_entry_safe(trans, next, &head, list) { nft_trans_list_del(trans); nft_commit_release(trans); } } void nf_tables_trans_destroy_flush_work(void) { flush_work(&trans_destroy_work); } EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); static bool nft_expr_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { return false; } static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) { const struct nft_expr *expr, *last; struct nft_regs_track track = {}; unsigned int size, data_size; void *data, *data_boundary; struct nft_rule_dp *prule; struct nft_rule *rule; /* already handled or inactive chain? */ if (chain->blob_next || !nft_is_active_next(net, chain)) return 0; data_size = 0; list_for_each_entry(rule, &chain->rules, list) { if (nft_is_active_next(net, rule)) { data_size += sizeof(*prule) + rule->dlen; if (data_size > INT_MAX) return -ENOMEM; } } chain->blob_next = nf_tables_chain_alloc_rules(chain, data_size); if (!chain->blob_next) return -ENOMEM; data = (void *)chain->blob_next->data; data_boundary = data + data_size; size = 0; list_for_each_entry(rule, &chain->rules, list) { if (!nft_is_active_next(net, rule)) continue; prule = (struct nft_rule_dp *)data; data += offsetof(struct nft_rule_dp, data); if (WARN_ON_ONCE(data > data_boundary)) return -ENOMEM; size = 0; track.last = nft_expr_last(rule); nft_rule_for_each_expr(expr, last, rule) { track.cur = expr; if (nft_expr_reduce(&track, expr)) { expr = track.cur; continue; } if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary)) return -ENOMEM; memcpy(data + size, expr, expr->ops->size); size += expr->ops->size; } if (WARN_ON_ONCE(size >= 1 << 12)) return -ENOMEM; prule->handle = rule->handle; prule->dlen = size; prule->is_last = 0; data += size; size = 0; chain->blob_next->size += (unsigned long)(data - (void *)prule); } if (WARN_ON_ONCE(data > data_boundary)) return -ENOMEM; prule = (struct nft_rule_dp *)data; nft_last_rule(chain, prule); return 0; } static void nf_tables_commit_chain_prepare_cancel(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWRULE || trans->msg_type == NFT_MSG_DELRULE) { struct nft_chain *chain = nft_trans_rule_chain(trans); kvfree(chain->blob_next); chain->blob_next = NULL; } } } static void __nf_tables_commit_chain_free_rules(struct rcu_head *h) { struct nft_rule_dp_last *l = container_of(h, struct nft_rule_dp_last, h); kvfree(l->blob); } static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob) { struct nft_rule_dp_last *last; /* last rule trailer is after end marker */ last = (void *)blob + sizeof(*blob) + blob->size; last->blob = blob; call_rcu(&last->h, __nf_tables_commit_chain_free_rules); } static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) { struct nft_rule_blob *g0, *g1; bool next_genbit; next_genbit = nft_gencursor_next(net); g0 = rcu_dereference_protected(chain->blob_gen_0, lockdep_commit_lock_is_held(net)); g1 = rcu_dereference_protected(chain->blob_gen_1, lockdep_commit_lock_is_held(net)); /* No changes to this chain? */ if (chain->blob_next == NULL) { /* chain had no change in last or next generation */ if (g0 == g1) return; /* * chain had no change in this generation; make sure next * one uses same rules as current generation. */ if (next_genbit) { rcu_assign_pointer(chain->blob_gen_1, g0); nf_tables_commit_chain_free_rules_old(g1); } else { rcu_assign_pointer(chain->blob_gen_0, g1); nf_tables_commit_chain_free_rules_old(g0); } return; } if (next_genbit) rcu_assign_pointer(chain->blob_gen_1, chain->blob_next); else rcu_assign_pointer(chain->blob_gen_0, chain->blob_next); chain->blob_next = NULL; if (g0 == g1) return; if (next_genbit) nf_tables_commit_chain_free_rules_old(g1); else nf_tables_commit_chain_free_rules_old(g0); } static void nft_obj_del(struct nft_object *obj) { rhltable_remove(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params); list_del_rcu(&obj->list); } void nft_chain_del(struct nft_chain *chain) { struct nft_table *table = chain->table; WARN_ON_ONCE(rhltable_remove(&table->chains_ht, &chain->rhlhead, nft_chain_ht_params)); list_del_rcu(&chain->list); } static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, struct nft_trans_gc *trans) { struct nft_elem_priv **priv = trans->priv; unsigned int i; for (i = 0; i < trans->count; i++) { nft_setelem_data_deactivate(ctx->net, trans->set, priv[i]); nft_setelem_remove(ctx->net, trans->set, priv[i]); } } void nft_trans_gc_destroy(struct nft_trans_gc *trans) { nft_set_put(trans->set); put_net(trans->net); kfree(trans); } static void nft_trans_gc_trans_free(struct rcu_head *rcu) { struct nft_elem_priv *elem_priv; struct nft_trans_gc *trans; struct nft_ctx ctx = {}; unsigned int i; trans = container_of(rcu, struct nft_trans_gc, rcu); ctx.net = read_pnet(&trans->set->net); for (i = 0; i < trans->count; i++) { elem_priv = trans->priv[i]; if (!nft_setelem_is_catchall(trans->set, elem_priv)) atomic_dec(&trans->set->nelems); nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv); } nft_trans_gc_destroy(trans); } static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) { struct nftables_pernet *nft_net; struct nft_ctx ctx = {}; nft_net = nft_pernet(trans->net); mutex_lock(&nft_net->commit_mutex); /* Check for race with transaction, otherwise this batch refers to * stale objects that might not be there anymore. Skip transaction if * set has been destroyed from control plane transaction in case gc * worker loses race. */ if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { mutex_unlock(&nft_net->commit_mutex); return false; } ctx.net = trans->net; ctx.table = trans->set->table; nft_trans_gc_setelem_remove(&ctx, trans); mutex_unlock(&nft_net->commit_mutex); return true; } static void nft_trans_gc_work(struct work_struct *work) { struct nft_trans_gc *trans, *next; LIST_HEAD(trans_gc_list); spin_lock(&nf_tables_gc_list_lock); list_splice_init(&nf_tables_gc_list, &trans_gc_list); spin_unlock(&nf_tables_gc_list_lock); list_for_each_entry_safe(trans, next, &trans_gc_list, list) { list_del(&trans->list); if (!nft_trans_gc_work_done(trans)) { nft_trans_gc_destroy(trans); continue; } call_rcu(&trans->rcu, nft_trans_gc_trans_free); } } struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, unsigned int gc_seq, gfp_t gfp) { struct net *net = read_pnet(&set->net); struct nft_trans_gc *trans; trans = kzalloc(sizeof(*trans), gfp); if (!trans) return NULL; trans->net = maybe_get_net(net); if (!trans->net) { kfree(trans); return NULL; } refcount_inc(&set->refs); trans->set = set; trans->seq = gc_seq; return trans; } void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) { trans->priv[trans->count++] = priv; } static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) { spin_lock(&nf_tables_gc_list_lock); list_add_tail(&trans->list, &nf_tables_gc_list); spin_unlock(&nf_tables_gc_list_lock); schedule_work(&trans_gc_work); } static int nft_trans_gc_space(struct nft_trans_gc *trans) { return NFT_TRANS_GC_BATCHCOUNT - trans->count; } struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp) { struct nft_set *set; if (nft_trans_gc_space(gc)) return gc; set = gc->set; nft_trans_gc_queue_work(gc); return nft_trans_gc_alloc(set, gc_seq, gfp); } void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) { if (trans->count == 0) { nft_trans_gc_destroy(trans); return; } nft_trans_gc_queue_work(trans); } struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) { struct nft_set *set; if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) return NULL; if (nft_trans_gc_space(gc)) return gc; set = gc->set; call_rcu(&gc->rcu, nft_trans_gc_trans_free); return nft_trans_gc_alloc(set, 0, gfp); } void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) { WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); if (trans->count == 0) { nft_trans_gc_destroy(trans); return; } call_rcu(&trans->rcu, nft_trans_gc_trans_free); } struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, unsigned int gc_seq) { struct nft_set_elem_catchall *catchall; const struct nft_set *set = gc->set; struct nft_set_ext *ext; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_expired(ext)) continue; if (nft_set_elem_is_dead(ext)) goto dead_elem; nft_set_elem_dead(ext); dead_elem: gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); if (!gc) return NULL; nft_trans_gc_elem_add(gc, catchall->elem); } return gc; } struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) { struct nft_set_elem_catchall *catchall, *next; u64 tstamp = nft_net_tstamp(gc->net); const struct nft_set *set = gc->set; struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)); list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!__nft_set_elem_expired(ext, tstamp)) continue; gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return NULL; elem_priv = catchall->elem; nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); nft_setelem_catchall_destroy(catchall); nft_trans_gc_elem_add(gc, elem_priv); } return gc; } static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_module_request *req, *next; WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); list_for_each_entry_safe(req, next, &nft_net->module_list, list) { WARN_ON_ONCE(!req->done); list_del(&req->list); kfree(req); } } static void nf_tables_commit_release(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; /* all side effects have to be made visible. * For example, if a chain named 'foo' has been deleted, a * new transaction must not find it anymore. * * Memory reclaim happens asynchronously from work queue * to prevent expensive synchronize_rcu() in commit phase. */ if (list_empty(&nft_net->commit_list)) { nf_tables_module_autoload_cleanup(net); mutex_unlock(&nft_net->commit_mutex); return; } trans = list_last_entry(&nft_net->commit_list, struct nft_trans, list); get_net(trans->net); WARN_ON_ONCE(trans->put_net); trans->put_net = true; spin_lock(&nf_tables_destroy_list_lock); list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list); spin_unlock(&nf_tables_destroy_list_lock); nf_tables_module_autoload_cleanup(net); schedule_work(&trans_destroy_work); mutex_unlock(&nft_net->commit_mutex); } static void nft_commit_notify(struct net *net, u32 portid) { struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *batch_skb = NULL, *nskb, *skb; unsigned char *data; int len; list_for_each_entry_safe(skb, nskb, &nft_net->notify_list, list) { if (!batch_skb) { new_batch: batch_skb = skb; len = NLMSG_GOODSIZE - skb->len; list_del(&skb->list); continue; } len -= skb->len; if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) { data = skb_put(batch_skb, skb->len); memcpy(data, skb->data, skb->len); list_del(&skb->list); kfree_skb(skb); continue; } nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES, NFT_CB(batch_skb).report, GFP_KERNEL); goto new_batch; } if (batch_skb) { nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES, NFT_CB(batch_skb).report, GFP_KERNEL); } WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } static int nf_tables_com