1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ /* 6 bucketsize, initval support added */ #define IPSET_TYPE_REV_MAX 7 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ #define HTYPE hash_ipport #define IP_SET_HASH_WITH_NETMASK #define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ /* Member elements */ struct hash_ipport4_elem { __be32 ip; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, const struct hash_ipport4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipport4_data_list(struct sk_buff *skb, const struct hash_ipport4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct hash_ipport4_elem *d) { next->ip = d->ip; next->port = d->port; } #define MTYPE hash_ipport4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); const struct MTYPE *h = set->data; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= h->bitmask.ip; if (e.ip == 0) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.ip &= h->bitmask.ip; if (e.ip == 0) return -EINVAL; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); if (i > IPSET_MAX_RANGE) { hash_ipport4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } } return ret; } /* IPv6 variant */ struct hash_ipport6_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipport6_data_list(struct sk_buff *skb, const struct hash_ipport6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipport6_data_next(struct hash_ipport6_elem *next, const struct hash_ipport6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipport6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); const struct MTYPE *h = set->data; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipport6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipport_type __read_mostly = { .name = "hash:ip,port", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipport_init(void) { return ip_set_type_register(&hash_ipport_type); } static void __exit hash_ipport_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipport_type); } module_init(hash_ipport_init); module_exit(hash_ipport_fini); |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2001 Vojtech Pavlik * * CATC EL1210A NetMate USB Ethernet driver * * Sponsored by SuSE * * Based on the work of * Donald Becker * * Old chipset support added by Simon Evans <spse@secret.org.uk> 2002 * - adds support for Belkin F5U011 */ /* * * Should you need to contact me, the author, you can do so either by * e-mail - mail your message to <vojtech@suse.cz>, or by paper mail: * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/crc32.h> #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/uaccess.h> #undef DEBUG #include <linux/usb.h> /* * Version information. */ #define DRIVER_VERSION "v2.8" #define DRIVER_AUTHOR "Vojtech Pavlik <vojtech@suse.cz>" #define DRIVER_DESC "CATC EL1210A NetMate USB Ethernet driver" #define SHORT_DRIVER_DESC "EL1210A NetMate USB Ethernet" MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); static const char driver_name[] = "catc"; /* * Some defines. */ #define STATS_UPDATE (HZ) /* Time between stats updates */ #define TX_TIMEOUT (5*HZ) /* Max time the queue can be stopped */ #define PKT_SZ 1536 /* Max Ethernet packet size */ #define RX_MAX_BURST 15 /* Max packets per rx buffer (> 0, < 16) */ #define TX_MAX_BURST 15 /* Max full sized packets per tx buffer (> 0) */ #define CTRL_QUEUE 16 /* Max control requests in flight (power of two) */ #define RX_PKT_SZ 1600 /* Max size of receive packet for F5U011 */ /* * Control requests. */ enum control_requests { ReadMem = 0xf1, GetMac = 0xf2, Reset = 0xf4, SetMac = 0xf5, SetRxMode = 0xf5, /* F5U011 only */ WriteROM = 0xf8, SetReg = 0xfa, GetReg = 0xfb, WriteMem = 0xfc, ReadROM = 0xfd, }; /* * Registers. */ enum register_offsets { TxBufCount = 0x20, RxBufCount = 0x21, OpModes = 0x22, TxQed = 0x23, RxQed = 0x24, MaxBurst = 0x25, RxUnit = 0x60, EthStatus = 0x61, StationAddr0 = 0x67, EthStats = 0x69, LEDCtrl = 0x81, }; enum eth_stats { TxSingleColl = 0x00, TxMultiColl = 0x02, TxExcessColl = 0x04, RxFramErr = 0x06, }; enum op_mode_bits { Op3MemWaits = 0x03, OpLenInclude = 0x08, OpRxMerge = 0x10, OpTxMerge = 0x20, OpWin95bugfix = 0x40, OpLoopback = 0x80, }; enum rx_filter_bits { RxEnable = 0x01, RxPolarity = 0x02, RxForceOK = 0x04, RxMultiCast = 0x08, RxPromisc = 0x10, AltRxPromisc = 0x20, /* F5U011 uses different bit */ }; enum led_values { LEDFast = 0x01, LEDSlow = 0x02, LEDFlash = 0x03, LEDPulse = 0x04, LEDLink = 0x08, }; enum link_status { LinkNoChange = 0, LinkGood = 1, LinkBad = 2 }; /* * The catc struct. */ #define CTRL_RUNNING 0 #define RX_RUNNING 1 #define TX_RUNNING 2 struct catc { struct net_device *netdev; struct usb_device *usbdev; unsigned long flags; unsigned int tx_ptr, tx_idx; unsigned int ctrl_head, ctrl_tail; spinlock_t tx_lock, ctrl_lock; u8 tx_buf[2][TX_MAX_BURST * (PKT_SZ + 2)]; u8 rx_buf[RX_MAX_BURST * (PKT_SZ + 2)]; u8 irq_buf[2]; u8 ctrl_buf[64]; struct usb_ctrlrequest ctrl_dr; struct timer_list timer; u8 stats_buf[8]; u16 stats_vals[4]; unsigned long last_stats; u8 multicast[64]; struct ctrl_queue { u8 dir; u8 request; u16 value; u16 index; void *buf; int len; void (*callback)(struct catc *catc, struct ctrl_queue *q); } ctrl_queue[CTRL_QUEUE]; struct urb *tx_urb, *rx_urb, *irq_urb, *ctrl_urb; u8 is_f5u011; /* Set if device is an F5U011 */ u8 rxmode[2]; /* Used for F5U011 */ atomic_t recq_sz; /* Used for F5U011 - counter of waiting rx packets */ }; /* * Useful macros. */ #define catc_get_mac(catc, mac) catc_ctrl_msg(catc, USB_DIR_IN, GetMac, 0, 0, mac, 6) #define catc_reset(catc) catc_ctrl_msg(catc, USB_DIR_OUT, Reset, 0, 0, NULL, 0) #define catc_set_reg(catc, reg, val) catc_ctrl_msg(catc, USB_DIR_OUT, SetReg, val, reg, NULL, 0) #define catc_get_reg(catc, reg, buf) catc_ctrl_msg(catc, USB_DIR_IN, GetReg, 0, reg, buf, 1) #define catc_write_mem(catc, addr, buf, size) catc_ctrl_msg(catc, USB_DIR_OUT, WriteMem, 0, addr, buf, size) #define catc_read_mem(catc, addr, buf, size) catc_ctrl_msg(catc, USB_DIR_IN, ReadMem, 0, addr, buf, size) #define f5u011_rxmode(catc, rxmode) catc_ctrl_msg(catc, USB_DIR_OUT, SetRxMode, 0, 1, rxmode, 2) #define f5u011_rxmode_async(catc, rxmode) catc_ctrl_async(catc, USB_DIR_OUT, SetRxMode, 0, 1, &rxmode, 2, NULL) #define f5u011_mchash_async(catc, hash) catc_ctrl_async(catc, USB_DIR_OUT, SetRxMode, 0, 2, &hash, 8, NULL) #define catc_set_reg_async(catc, reg, val) catc_ctrl_async(catc, USB_DIR_OUT, SetReg, val, reg, NULL, 0, NULL) #define catc_get_reg_async(catc, reg, cb) catc_ctrl_async(catc, USB_DIR_IN, GetReg, 0, reg, NULL, 1, cb) #define catc_write_mem_async(catc, addr, buf, size) catc_ctrl_async(catc, USB_DIR_OUT, WriteMem, 0, addr, buf, size, NULL) /* * Receive routines. */ static void catc_rx_done(struct urb *urb) { struct catc *catc = urb->context; u8 *pkt_start = urb->transfer_buffer; struct sk_buff *skb; int pkt_len, pkt_offset = 0; int status = urb->status; if (!catc->is_f5u011) { clear_bit(RX_RUNNING, &catc->flags); pkt_offset = 2; } if (status) { dev_dbg(&urb->dev->dev, "rx_done, status %d, length %d\n", status, urb->actual_length); return; } do { if(!catc->is_f5u011) { pkt_len = le16_to_cpup((__le16*)pkt_start); if (pkt_len > urb->actual_length) { catc->netdev->stats.rx_length_errors++; catc->netdev->stats.rx_errors++; break; } } else { pkt_len = urb->actual_length; } if (!(skb = dev_alloc_skb(pkt_len))) return; skb_copy_to_linear_data(skb, pkt_start + pkt_offset, pkt_len); skb_put(skb, pkt_len); skb->protocol = eth_type_trans(skb, catc->netdev); netif_rx(skb); catc->netdev->stats.rx_packets++; catc->netdev->stats.rx_bytes += pkt_len; /* F5U011 only does one packet per RX */ if (catc->is_f5u011) break; pkt_start += (((pkt_len + 1) >> 6) + 1) << 6; } while (pkt_start - (u8 *) urb->transfer_buffer < urb->actual_length); if (catc->is_f5u011) { if (atomic_read(&catc->recq_sz)) { int state; atomic_dec(&catc->recq_sz); netdev_dbg(catc->netdev, "getting extra packet\n"); urb->dev = catc->usbdev; if ((state = usb_submit_urb(urb, GFP_ATOMIC)) < 0) { netdev_dbg(catc->netdev, "submit(rx_urb) status %d\n", state); } } else { clear_bit(RX_RUNNING, &catc->flags); } } } static void catc_irq_done(struct urb *urb) { struct catc *catc = urb->context; u8 *data = urb->transfer_buffer; int status = urb->status; unsigned int hasdata, linksts = LinkNoChange; int res; if (!catc->is_f5u011) { hasdata = data[1] & 0x80; if (data[1] & 0x40) linksts = LinkGood; else if (data[1] & 0x20) linksts = LinkBad; } else { hasdata = (unsigned int)(be16_to_cpup((__be16*)data) & 0x0fff); if (data[0] == 0x90) linksts = LinkGood; else if (data[0] == 0xA0) linksts = LinkBad; } switch (status) { case 0: /* success */ break; case -ECONNRESET: /* unlink */ case -ENOENT: case -ESHUTDOWN: return; /* -EPIPE: should clear the halt */ default: /* error */ dev_dbg(&urb->dev->dev, "irq_done, status %d, data %02x %02x.\n", status, data[0], data[1]); goto resubmit; } if (linksts == LinkGood) { netif_carrier_on(catc->netdev); netdev_dbg(catc->netdev, "link ok\n"); } if (linksts == LinkBad) { netif_carrier_off(catc->netdev); netdev_dbg(catc->netdev, "link bad\n"); } if (hasdata) { if (test_and_set_bit(RX_RUNNING, &catc->flags)) { if (catc->is_f5u011) atomic_inc(&catc->recq_sz); } else { catc->rx_urb->dev = catc->usbdev; if ((res = usb_submit_urb(catc->rx_urb, GFP_ATOMIC)) < 0) { dev_err(&catc->usbdev->dev, "submit(rx_urb) status %d\n", res); } } } resubmit: res = usb_submit_urb (urb, GFP_ATOMIC); if (res) dev_err(&catc->usbdev->dev, "can't resubmit intr, %s-%s, status %d\n", catc->usbdev->bus->bus_name, catc->usbdev->devpath, res); } /* * Transmit routines. */ static int catc_tx_run(struct catc *catc) { int status; if (catc->is_f5u011) catc->tx_ptr = (catc->tx_ptr + 63) & ~63; catc->tx_urb->transfer_buffer_length = catc->tx_ptr; catc->tx_urb->transfer_buffer = catc->tx_buf[catc->tx_idx]; catc->tx_urb->dev = catc->usbdev; if ((status = usb_submit_urb(catc->tx_urb, GFP_ATOMIC)) < 0) dev_err(&catc->usbdev->dev, "submit(tx_urb), status %d\n", status); catc->tx_idx = !catc->tx_idx; catc->tx_ptr = 0; netif_trans_update(catc->netdev); return status; } static void catc_tx_done(struct urb *urb) { struct catc *catc = urb->context; unsigned long flags; int r, status = urb->status; if (status == -ECONNRESET) { dev_dbg(&urb->dev->dev, "Tx Reset.\n"); urb->status = 0; netif_trans_update(catc->netdev); catc->netdev->stats.tx_errors++; clear_bit(TX_RUNNING, &catc->flags); netif_wake_queue(catc->netdev); return; } if (status) { dev_dbg(&urb->dev->dev, "tx_done, status %d, length %d\n", status, urb->actual_length); return; } spin_lock_irqsave(&catc->tx_lock, flags); if (catc->tx_ptr) { r = catc_tx_run(catc); if (unlikely(r < 0)) clear_bit(TX_RUNNING, &catc->flags); } else { clear_bit(TX_RUNNING, &catc->flags); } netif_wake_queue(catc->netdev); spin_unlock_irqrestore(&catc->tx_lock, flags); } static netdev_tx_t catc_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); unsigned long flags; int r = 0; char *tx_buf; spin_lock_irqsave(&catc->tx_lock, flags); catc->tx_ptr = (((catc->tx_ptr - 1) >> 6) + 1) << 6; tx_buf = catc->tx_buf[catc->tx_idx] + catc->tx_ptr; if (catc->is_f5u011) *(__be16 *)tx_buf = cpu_to_be16(skb->len); else *(__le16 *)tx_buf = cpu_to_le16(skb->len); skb_copy_from_linear_data(skb, tx_buf + 2, skb->len); catc->tx_ptr += skb->len + 2; if (!test_and_set_bit(TX_RUNNING, &catc->flags)) { r = catc_tx_run(catc); if (r < 0) clear_bit(TX_RUNNING, &catc->flags); } if ((catc->is_f5u011 && catc->tx_ptr) || (catc->tx_ptr >= ((TX_MAX_BURST - 1) * (PKT_SZ + 2)))) netif_stop_queue(netdev); spin_unlock_irqrestore(&catc->tx_lock, flags); if (r >= 0) { catc->netdev->stats.tx_bytes += skb->len; catc->netdev->stats.tx_packets++; } dev_kfree_skb(skb); return NETDEV_TX_OK; } static void catc_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct catc *catc = netdev_priv(netdev); dev_warn(&netdev->dev, "Transmit timed out.\n"); usb_unlink_urb(catc->tx_urb); } /* * Control messages. */ static int catc_ctrl_msg(struct catc *catc, u8 dir, u8 request, u16 value, u16 index, void *buf, int len) { int retval = usb_control_msg(catc->usbdev, dir ? usb_rcvctrlpipe(catc->usbdev, 0) : usb_sndctrlpipe(catc->usbdev, 0), request, 0x40 | dir, value, index, buf, len, 1000); return retval < 0 ? retval : 0; } static void catc_ctrl_run(struct catc *catc) { struct ctrl_queue *q = catc->ctrl_queue + catc->ctrl_tail; struct usb_device *usbdev = catc->usbdev; struct urb *urb = catc->ctrl_urb; struct usb_ctrlrequest *dr = &catc->ctrl_dr; int status; dr->bRequest = q->request; dr->bRequestType = 0x40 | q->dir; dr->wValue = cpu_to_le16(q->value); dr->wIndex = cpu_to_le16(q->index); dr->wLength = cpu_to_le16(q->len); urb->pipe = q->dir ? usb_rcvctrlpipe(usbdev, 0) : usb_sndctrlpipe(usbdev, 0); urb->transfer_buffer_length = q->len; urb->transfer_buffer = catc->ctrl_buf; urb->setup_packet = (void *) dr; urb->dev = usbdev; if (!q->dir && q->buf && q->len) memcpy(catc->ctrl_buf, q->buf, q->len); if ((status = usb_submit_urb(catc->ctrl_urb, GFP_ATOMIC))) dev_err(&catc->usbdev->dev, "submit(ctrl_urb) status %d\n", status); } static void catc_ctrl_done(struct urb *urb) { struct catc *catc = urb->context; struct ctrl_queue *q; unsigned long flags; int status = urb->status; if (status) dev_dbg(&urb->dev->dev, "ctrl_done, status %d, len %d.\n", status, urb->actual_length); spin_lock_irqsave(&catc->ctrl_lock, flags); q = catc->ctrl_queue + catc->ctrl_tail; if (q->dir) { if (q->buf && q->len) memcpy(q->buf, catc->ctrl_buf, q->len); else q->buf = catc->ctrl_buf; } if (q->callback) q->callback(catc, q); catc->ctrl_tail = (catc->ctrl_tail + 1) & (CTRL_QUEUE - 1); if (catc->ctrl_head != catc->ctrl_tail) catc_ctrl_run(catc); else clear_bit(CTRL_RUNNING, &catc->flags); spin_unlock_irqrestore(&catc->ctrl_lock, flags); } static int catc_ctrl_async(struct catc *catc, u8 dir, u8 request, u16 value, u16 index, void *buf, int len, void (*callback)(struct catc *catc, struct ctrl_queue *q)) { struct ctrl_queue *q; int retval = 0; unsigned long flags; spin_lock_irqsave(&catc->ctrl_lock, flags); q = catc->ctrl_queue + catc->ctrl_head; q->dir = dir; q->request = request; q->value = value; q->index = index; q->buf = buf; q->len = len; q->callback = callback; catc->ctrl_head = (catc->ctrl_head + 1) & (CTRL_QUEUE - 1); if (catc->ctrl_head == catc->ctrl_tail) { dev_err(&catc->usbdev->dev, "ctrl queue full\n"); catc->ctrl_tail = (catc->ctrl_tail + 1) & (CTRL_QUEUE - 1); retval = -1; } if (!test_and_set_bit(CTRL_RUNNING, &catc->flags)) catc_ctrl_run(catc); spin_unlock_irqrestore(&catc->ctrl_lock, flags); return retval; } /* * Statistics. */ static void catc_stats_done(struct catc *catc, struct ctrl_queue *q) { int index = q->index - EthStats; u16 data, last; catc->stats_buf[index] = *((char *)q->buf); if (index & 1) return; data = ((u16)catc->stats_buf[index] << 8) | catc->stats_buf[index + 1]; last = catc->stats_vals[index >> 1]; switch (index) { case TxSingleColl: case TxMultiColl: catc->netdev->stats.collisions += data - last; break; case TxExcessColl: catc->netdev->stats.tx_aborted_errors += data - last; catc->netdev->stats.tx_errors += data - last; break; case RxFramErr: catc->netdev->stats.rx_frame_errors += data - last; catc->netdev->stats.rx_errors += data - last; break; } catc->stats_vals[index >> 1] = data; } static void catc_stats_timer(struct timer_list *t) { struct catc *catc = from_timer(catc, t, timer); int i; for (i = 0; i < 8; i++) catc_get_reg_async(catc, EthStats + 7 - i, catc_stats_done); mod_timer(&catc->timer, jiffies + STATS_UPDATE); } /* * Receive modes. Broadcast, Multicast, Promisc. */ static void catc_multicast(const unsigned char *addr, u8 *multicast) { u32 crc; crc = ether_crc_le(6, addr); multicast[(crc >> 3) & 0x3f] |= 1 << (crc & 7); } static void catc_set_multicast_list(struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); struct netdev_hw_addr *ha; u8 broadcast[ETH_ALEN]; u8 rx = RxEnable | RxPolarity | RxMultiCast; eth_broadcast_addr(broadcast); memset(catc->multicast, 0, 64); catc_multicast(broadcast, catc->multicast); catc_multicast(netdev->dev_addr, catc->multicast); if (netdev->flags & IFF_PROMISC) { memset(catc->multicast, 0xff, 64); rx |= (!catc->is_f5u011) ? RxPromisc : AltRxPromisc; } if (netdev->flags & IFF_ALLMULTI) { memset(catc->multicast, 0xff, 64); } else { netdev_for_each_mc_addr(ha, netdev) { u32 crc = ether_crc_le(6, ha->addr); if (!catc->is_f5u011) { catc->multicast[(crc >> 3) & 0x3f] |= 1 << (crc & 7); } else { catc->multicast[7-(crc >> 29)] |= 1 << ((crc >> 26) & 7); } } } if (!catc->is_f5u011) { catc_set_reg_async(catc, RxUnit, rx); catc_write_mem_async(catc, 0xfa80, catc->multicast, 64); } else { f5u011_mchash_async(catc, catc->multicast); if (catc->rxmode[0] != rx) { catc->rxmode[0] = rx; netdev_dbg(catc->netdev, "Setting RX mode to %2.2X %2.2X\n", catc->rxmode[0], catc->rxmode[1]); f5u011_rxmode_async(catc, catc->rxmode); } } } static void catc_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct catc *catc = netdev_priv(dev); strscpy(info->driver, driver_name, sizeof(info->driver)); strscpy(info->version, DRIVER_VERSION, sizeof(info->version)); usb_make_path(catc->usbdev, info->bus_info, sizeof(info->bus_info)); } static int catc_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct catc *catc = netdev_priv(dev); if (!catc->is_f5u011) return -EOPNOTSUPP; ethtool_link_ksettings_zero_link_mode(cmd, supported); ethtool_link_ksettings_add_link_mode(cmd, supported, 10baseT_Half); ethtool_link_ksettings_add_link_mode(cmd, supported, TP); ethtool_link_ksettings_zero_link_mode(cmd, advertising); ethtool_link_ksettings_add_link_mode(cmd, advertising, 10baseT_Half); ethtool_link_ksettings_add_link_mode(cmd, advertising, TP); cmd->base.speed = SPEED_10; cmd->base.duplex = DUPLEX_HALF; cmd->base.port = PORT_TP; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_DISABLE; return 0; } static const struct ethtool_ops ops = { .get_drvinfo = catc_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = catc_get_link_ksettings, }; /* * Open, close. */ static int catc_open(struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); int status; catc->irq_urb->dev = catc->usbdev; if ((status = usb_submit_urb(catc->irq_urb, GFP_KERNEL)) < 0) { dev_err(&catc->usbdev->dev, "submit(irq_urb) status %d\n", status); return -1; } netif_start_queue(netdev); if (!catc->is_f5u011) mod_timer(&catc->timer, jiffies + STATS_UPDATE); return 0; } static int catc_stop(struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); netif_stop_queue(netdev); if (!catc->is_f5u011) del_timer_sync(&catc->timer); usb_kill_urb(catc->rx_urb); usb_kill_urb(catc->tx_urb); usb_kill_urb(catc->irq_urb); usb_kill_urb(catc->ctrl_urb); return 0; } static const struct net_device_ops catc_netdev_ops = { .ndo_open = catc_open, .ndo_stop = catc_stop, .ndo_start_xmit = catc_start_xmit, .ndo_tx_timeout = catc_tx_timeout, .ndo_set_rx_mode = catc_set_multicast_list, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; /* * USB probe, disconnect. */ static int catc_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct device *dev = &intf->dev; struct usb_device *usbdev = interface_to_usbdev(intf); struct net_device *netdev; struct catc *catc; u8 broadcast[ETH_ALEN]; u8 *macbuf; int pktsz, ret = -ENOMEM; macbuf = kmalloc(ETH_ALEN, GFP_KERNEL); if (!macbuf) goto error; if (usb_set_interface(usbdev, intf->altsetting->desc.bInterfaceNumber, 1)) { dev_err(dev, "Can't set altsetting 1.\n"); ret = -EIO; goto fail_mem; } netdev = alloc_etherdev(sizeof(struct catc)); if (!netdev) goto fail_mem; catc = netdev_priv(netdev); netdev->netdev_ops = &catc_netdev_ops; netdev->watchdog_timeo = TX_TIMEOUT; netdev->ethtool_ops = &ops; catc->usbdev = usbdev; catc->netdev = netdev; spin_lock_init(&catc->tx_lock); spin_lock_init(&catc->ctrl_lock); timer_setup(&catc->timer, catc_stats_timer, 0); catc->ctrl_urb = usb_alloc_urb(0, GFP_KERNEL); catc->tx_urb = usb_alloc_urb(0, GFP_KERNEL); catc->rx_urb = usb_alloc_urb(0, GFP_KERNEL); catc->irq_urb = usb_alloc_urb(0, GFP_KERNEL); if ((!catc->ctrl_urb) || (!catc->tx_urb) || (!catc->rx_urb) || (!catc->irq_urb)) { dev_err(&intf->dev, "No free urbs available.\n"); ret = -ENOMEM; goto fail_free; } /* The F5U011 has the same vendor/product as the netmate but a device version of 0x130 */ if (le16_to_cpu(usbdev->descriptor.idVendor) == 0x0423 && le16_to_cpu(usbdev->descriptor.idProduct) == 0xa && le16_to_cpu(catc->usbdev->descriptor.bcdDevice) == 0x0130) { dev_dbg(dev, "Testing for f5u011\n"); catc->is_f5u011 = 1; atomic_set(&catc->recq_sz, 0); pktsz = RX_PKT_SZ; } else { pktsz = RX_MAX_BURST * (PKT_SZ + 2); } usb_fill_control_urb(catc->ctrl_urb, usbdev, usb_sndctrlpipe(usbdev, 0), NULL, NULL, 0, catc_ctrl_done, catc); usb_fill_bulk_urb(catc->tx_urb, usbdev, usb_sndbulkpipe(usbdev, 1), NULL, 0, catc_tx_done, catc); usb_fill_bulk_urb(catc->rx_urb, usbdev, usb_rcvbulkpipe(usbdev, 1), catc->rx_buf, pktsz, catc_rx_done, catc); usb_fill_int_urb(catc->irq_urb, usbdev, usb_rcvintpipe(usbdev, 2), catc->irq_buf, 2, catc_irq_done, catc, 1); if (!catc->is_f5u011) { u32 *buf; int i; dev_dbg(dev, "Checking memory size\n"); buf = kmalloc(4, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto fail_free; } *buf = 0x12345678; catc_write_mem(catc, 0x7a80, buf, 4); *buf = 0x87654321; catc_write_mem(catc, 0xfa80, buf, 4); catc_read_mem(catc, 0x7a80, buf, 4); switch (*buf) { case 0x12345678: catc_set_reg(catc, TxBufCount, 8); catc_set_reg(catc, RxBufCount, 32); dev_dbg(dev, "64k Memory\n"); break; default: dev_warn(&intf->dev, "Couldn't detect memory size, assuming 32k\n"); fallthrough; case 0x87654321: catc_set_reg(catc, TxBufCount, 4); catc_set_reg(catc, RxBufCount, 16); dev_dbg(dev, "32k Memory\n"); break; } kfree(buf); dev_dbg(dev, "Getting MAC from SEEROM.\n"); catc_get_mac(catc, macbuf); eth_hw_addr_set(netdev, macbuf); dev_dbg(dev, "Setting MAC into registers.\n"); for (i = 0; i < 6; i++) catc_set_reg(catc, StationAddr0 - i, netdev->dev_addr[i]); dev_dbg(dev, "Filling the multicast list.\n"); eth_broadcast_addr(broadcast); catc_multicast(broadcast, catc->multicast); catc_multicast(netdev->dev_addr, catc->multicast); catc_write_mem(catc, 0xfa80, catc->multicast, 64); dev_dbg(dev, "Clearing error counters.\n"); for (i = 0; i < 8; i++) catc_set_reg(catc, EthStats + i, 0); catc->last_stats = jiffies; dev_dbg(dev, "Enabling.\n"); catc_set_reg(catc, MaxBurst, RX_MAX_BURST); catc_set_reg(catc, OpModes, OpTxMerge | OpRxMerge | OpLenInclude | Op3MemWaits); catc_set_reg(catc, LEDCtrl, LEDLink); catc_set_reg(catc, RxUnit, RxEnable | RxPolarity | RxMultiCast); } else { dev_dbg(dev, "Performing reset\n"); catc_reset(catc); catc_get_mac(catc, macbuf); eth_hw_addr_set(netdev, macbuf); dev_dbg(dev, "Setting RX Mode\n"); catc->rxmode[0] = RxEnable | RxPolarity | RxMultiCast; catc->rxmode[1] = 0; f5u011_rxmode(catc, catc->rxmode); } dev_dbg(dev, "Init done.\n"); printk(KERN_INFO "%s: %s USB Ethernet at usb-%s-%s, %pM.\n", netdev->name, (catc->is_f5u011) ? "Belkin F5U011" : "CATC EL1210A NetMate", usbdev->bus->bus_name, usbdev->devpath, netdev->dev_addr); usb_set_intfdata(intf, catc); SET_NETDEV_DEV(netdev, &intf->dev); ret = register_netdev(netdev); if (ret) goto fail_clear_intfdata; kfree(macbuf); return 0; fail_clear_intfdata: usb_set_intfdata(intf, NULL); fail_free: usb_free_urb(catc->ctrl_urb); usb_free_urb(catc->tx_urb); usb_free_urb(catc->rx_urb); usb_free_urb(catc->irq_urb); free_netdev(netdev); fail_mem: kfree(macbuf); error: return ret; } static void catc_disconnect(struct usb_interface *intf) { struct catc *catc = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); if (catc) { unregister_netdev(catc->netdev); usb_free_urb(catc->ctrl_urb); usb_free_urb(catc->tx_urb); usb_free_urb(catc->rx_urb); usb_free_urb(catc->irq_urb); free_netdev(catc->netdev); } } /* * Module functions and tables. */ static const struct usb_device_id catc_id_table[] = { { USB_DEVICE(0x0423, 0xa) }, /* CATC Netmate, Belkin F5U011 */ { USB_DEVICE(0x0423, 0xc) }, /* CATC Netmate II, Belkin F5U111 */ { USB_DEVICE(0x08d1, 0x1) }, /* smartBridges smartNIC */ { } }; MODULE_DEVICE_TABLE(usb, catc_id_table); static struct usb_driver catc_driver = { .name = driver_name, .probe = catc_probe, .disconnect = catc_disconnect, .id_table = catc_id_table, .disable_hub_initiated_lpm = 1, }; module_usb_driver(catc_driver); |
4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 | // SPDX-License-Identifier: GPL-2.0-or-later /* delayacct.c - per-task delay accounting * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 */ #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/cputime.h> #include <linux/sched/clock.h> #include <linux/slab.h> #include <linux/taskstats.h> #include <linux/sysctl.h> #include <linux/delayacct.h> #include <linux/module.h> DEFINE_STATIC_KEY_FALSE(delayacct_key); int delayacct_on __read_mostly; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; static void set_delayacct(bool enabled) { if (enabled) { static_branch_enable(&delayacct_key); delayacct_on = 1; } else { delayacct_on = 0; static_branch_disable(&delayacct_key); } } static int __init delayacct_setup_enable(char *str) { delayacct_on = 1; return 1; } __setup("delayacct", delayacct_setup_enable); void delayacct_init(void) { delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); set_delayacct(delayacct_on); } #ifdef CONFIG_PROC_SYSCTL static int sysctl_delayacct(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int state = delayacct_on; struct ctl_table t; int err; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; t = *table; t.data = &state; err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; if (write) set_delayacct(state); return err; } static struct ctl_table kern_delayacct_table[] = { { .procname = "task_delayacct", .data = NULL, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sysctl_delayacct, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static __init int kernel_delayacct_sysctls_init(void) { register_sysctl_init("kernel", kern_delayacct_table); return 0; } late_initcall(kernel_delayacct_sysctls_init); #endif void __delayacct_tsk_init(struct task_struct *tsk) { tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); if (tsk->delays) raw_spin_lock_init(&tsk->delays->lock); } /* * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count) { s64 ns = local_clock() - *start; unsigned long flags; if (ns > 0) { raw_spin_lock_irqsave(lock, flags); *total += ns; (*count)++; raw_spin_unlock_irqrestore(lock, flags); } } void __delayacct_blkio_start(void) { current->delays->blkio_start = local_clock(); } /* * We cannot rely on the `current` macro, as we haven't yet switched back to * the process being woken. */ void __delayacct_blkio_end(struct task_struct *p) { delayacct_end(&p->delays->lock, &p->delays->blkio_start, &p->delays->blkio_delay, &p->delays->blkio_count); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { u64 utime, stime, stimescaled, utimescaled; unsigned long long t2, t3; unsigned long flags, t1; s64 tmp; task_cputime(tsk, &utime, &stime); tmp = (s64)d->cpu_run_real_total; tmp += utime + stime; d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; task_cputime_scaled(tsk, &utimescaled, &stimescaled); tmp = (s64)d->cpu_scaled_run_real_total; tmp += utimescaled + stimescaled; d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; /* * No locking available for sched_info (and too expensive to add one) * Mitigate by taking snapshot of values */ t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; t3 = tsk->se.sum_exec_runtime; d->cpu_count += t1; tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; tmp = (s64)d->cpu_run_virtual_total + t3; d->cpu_run_virtual_total = (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; if (!tsk->delays) return 0; /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ raw_spin_lock_irqsave(&tsk->delays->lock, flags); tmp = d->blkio_delay_total + tsk->delays->blkio_delay; d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; tmp = d->compact_delay_total + tsk->delays->compact_delay; d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; tmp = d->irq_delay_total + tsk->delays->irq_delay; d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; d->compact_count += tsk->delays->compact_count; d->wpcopy_count += tsk->delays->wpcopy_count; d->irq_count += tsk->delays->irq_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; } __u64 __delayacct_blkio_ticks(struct task_struct *tsk) { __u64 ret; unsigned long flags; raw_spin_lock_irqsave(&tsk->delays->lock, flags); ret = nsec_to_clock_t(tsk->delays->blkio_delay); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } void __delayacct_freepages_start(void) { current->delays->freepages_start = local_clock(); } void __delayacct_freepages_end(void) { delayacct_end(¤t->delays->lock, ¤t->delays->freepages_start, ¤t->delays->freepages_delay, ¤t->delays->freepages_count); } void __delayacct_thrashing_start(bool *in_thrashing) { *in_thrashing = !!current->in_thrashing; if (*in_thrashing) return; current->in_thrashing = 1; current->delays->thrashing_start = local_clock(); } void __delayacct_thrashing_end(bool *in_thrashing) { if (*in_thrashing) return; current->in_thrashing = 0; delayacct_end(¤t->delays->lock, ¤t->delays->thrashing_start, ¤t->delays->thrashing_delay, ¤t->delays->thrashing_count); } void __delayacct_swapin_start(void) { current->delays->swapin_start = local_clock(); } void __delayacct_swapin_end(void) { delayacct_end(¤t->delays->lock, ¤t->delays->swapin_start, ¤t->delays->swapin_delay, ¤t->delays->swapin_count); } void __delayacct_compact_start(void) { current->delays->compact_start = local_clock(); } void __delayacct_compact_end(void) { delayacct_end(¤t->delays->lock, ¤t->delays->compact_start, ¤t->delays->compact_delay, ¤t->delays->compact_count); } void __delayacct_wpcopy_start(void) { current->delays->wpcopy_start = local_clock(); } void __delayacct_wpcopy_end(void) { delayacct_end(¤t->delays->lock, ¤t->delays->wpcopy_start, ¤t->delays->wpcopy_delay, ¤t->delays->wpcopy_count); } void __delayacct_irq(struct task_struct *task, u32 delta) { unsigned long flags; raw_spin_lock_irqsave(&task->delays->lock, flags); task->delays->irq_delay += delta; task->delays->irq_count++; raw_spin_unlock_irqrestore(&task->delays->lock, flags); } |
9 9 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | // SPDX-License-Identifier: MIT /* * Copyright © 2021 Intel Corporation */ #include <drm/drm_edid.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_displayid_internal.h" static const struct displayid_header * displayid_get_header(const u8 *displayid, int length, int index) { const struct displayid_header *base; if (sizeof(*base) > length - index) return ERR_PTR(-EINVAL); base = (const struct displayid_header *)&displayid[index]; return base; } static const struct displayid_header * validate_displayid(const u8 *displayid, int length, int idx) { int i, dispid_length; u8 csum = 0; const struct displayid_header *base; base = displayid_get_header(displayid, length, idx); if (IS_ERR(base)) return base; DRM_DEBUG_KMS("base revision 0x%x, length %d, %d %d\n", base->rev, base->bytes, base->prod_id, base->ext_count); /* +1 for DispID checksum */ dispid_length = sizeof(*base) + base->bytes + 1; if (dispid_length > length - idx) return ERR_PTR(-EINVAL); for (i = 0; i < dispid_length; i++) csum += displayid[idx + i]; if (csum) { DRM_NOTE("DisplayID checksum invalid, remainder is %d\n", csum); return ERR_PTR(-EINVAL); } return base; } static const u8 *drm_find_displayid_extension(const struct drm_edid *drm_edid, int *length, int *idx, int *ext_index) { const struct displayid_header *base; const u8 *displayid; displayid = drm_edid_find_extension(drm_edid, DISPLAYID_EXT, ext_index); if (!displayid) return NULL; /* EDID extensions block checksum isn't for us */ *length = EDID_LENGTH - 1; *idx = 1; base = validate_displayid(displayid, *length, *idx); if (IS_ERR(base)) return NULL; *length = *idx + sizeof(*base) + base->bytes; return displayid; } void displayid_iter_edid_begin(const struct drm_edid *drm_edid, struct displayid_iter *iter) { memset(iter, 0, sizeof(*iter)); iter->drm_edid = drm_edid; } static const struct displayid_block * displayid_iter_block(const struct displayid_iter *iter) { const struct displayid_block *block; if (!iter->section) return NULL; block = (const struct displayid_block *)&iter->section[iter->idx]; if (iter->idx + sizeof(*block) <= iter->length && iter->idx + sizeof(*block) + block->num_bytes <= iter->length) return block; return NULL; } const struct displayid_block * __displayid_iter_next(struct displayid_iter *iter) { const struct displayid_block *block; if (!iter->drm_edid) return NULL; if (iter->section) { /* current block should always be valid */ block = displayid_iter_block(iter); if (WARN_ON(!block)) { iter->section = NULL; iter->drm_edid = NULL; return NULL; } /* next block in section */ iter->idx += sizeof(*block) + block->num_bytes; block = displayid_iter_block(iter); if (block) return block; } for (;;) { /* The first section we encounter is the base section */ bool base_section = !iter->section; iter->section = drm_find_displayid_extension(iter->drm_edid, &iter->length, &iter->idx, &iter->ext_index); if (!iter->section) { iter->drm_edid = NULL; return NULL; } /* Save the structure version and primary use case. */ if (base_section) { const struct displayid_header *base; base = displayid_get_header(iter->section, iter->length, iter->idx); if (!IS_ERR(base)) { iter->version = base->rev; iter->primary_use = base->prod_id; } } iter->idx += sizeof(struct displayid_header); block = displayid_iter_block(iter); if (block) return block; } } void displayid_iter_end(struct displayid_iter *iter) { memset(iter, 0, sizeof(*iter)); } /* DisplayID Structure Version/Revision from the Base Section. */ u8 displayid_version(const struct displayid_iter *iter) { return iter->version; } /* * DisplayID Primary Use Case (2.0+) or Product Type Identifier (1.0-1.3) from * the Base Section. */ u8 displayid_primary_use(const struct displayid_iter *iter) { return iter->primary_use; } |
1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 | // SPDX-License-Identifier: LGPL-2.1-or-later /* * dmxdev.c - DVB demultiplexer device * * Copyright (C) 2000 Ralph Metzler & Marcus Metzler * for convergence integrated media GmbH */ #define pr_fmt(fmt) "dmxdev: " fmt #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/poll.h> #include <linux/ioctl.h> #include <linux/wait.h> #include <linux/uaccess.h> #include <media/dmxdev.h> #include <media/dvb_vb2.h> static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "Turn on/off debugging (default:off)."); #define dprintk(fmt, arg...) do { \ if (debug) \ printk(KERN_DEBUG pr_fmt("%s: " fmt), \ __func__, ##arg); \ } while (0) static int dvb_dmxdev_buffer_write(struct dvb_ringbuffer *buf, const u8 *src, size_t len) { ssize_t free; if (!len) return 0; if (!buf->data) return 0; free = dvb_ringbuffer_free(buf); if (len > free) { dprintk("buffer overflow\n"); return -EOVERFLOW; } return dvb_ringbuffer_write(buf, src, len); } static ssize_t dvb_dmxdev_buffer_read(struct dvb_ringbuffer *src, int non_blocking, char __user *buf, size_t count, loff_t *ppos) { size_t todo; ssize_t avail; ssize_t ret = 0; if (!src->data) return 0; if (src->error) { ret = src->error; dvb_ringbuffer_flush(src); return ret; } for (todo = count; todo > 0; todo -= ret) { if (non_blocking && dvb_ringbuffer_empty(src)) { ret = -EWOULDBLOCK; break; } ret = wait_event_interruptible(src->queue, !dvb_ringbuffer_empty(src) || (src->error != 0)); if (ret < 0) break; if (src->error) { ret = src->error; dvb_ringbuffer_flush(src); break; } avail = dvb_ringbuffer_avail(src); if (avail > todo) avail = todo; ret = dvb_ringbuffer_read_user(src, buf, avail); if (ret < 0) break; buf += ret; } return (count - todo) ? (count - todo) : ret; } static struct dmx_frontend *get_fe(struct dmx_demux *demux, int type) { struct list_head *head, *pos; head = demux->get_frontends(demux); if (!head) return NULL; list_for_each(pos, head) if (DMX_FE_ENTRY(pos)->source == type) return DMX_FE_ENTRY(pos); return NULL; } static int dvb_dvr_open(struct inode *inode, struct file *file) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; struct dmx_frontend *front; bool need_ringbuffer = false; dprintk("%s\n", __func__); if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; if (dmxdev->exit) { mutex_unlock(&dmxdev->mutex); return -ENODEV; } dmxdev->may_do_mmap = 0; /* * The logic here is a little tricky due to the ifdef. * * The ringbuffer is used for both read and mmap. * * It is not needed, however, on two situations: * - Write devices (access with O_WRONLY); * - For duplex device nodes, opened with O_RDWR. */ if ((file->f_flags & O_ACCMODE) == O_RDONLY) need_ringbuffer = true; else if ((file->f_flags & O_ACCMODE) == O_RDWR) { if (!(dmxdev->capabilities & DMXDEV_CAP_DUPLEX)) { #ifdef CONFIG_DVB_MMAP dmxdev->may_do_mmap = 1; need_ringbuffer = true; #else mutex_unlock(&dmxdev->mutex); return -EOPNOTSUPP; #endif } } if (need_ringbuffer) { void *mem; if (!dvbdev->readers) { mutex_unlock(&dmxdev->mutex); return -EBUSY; } mem = vmalloc(DVR_BUFFER_SIZE); if (!mem) { mutex_unlock(&dmxdev->mutex); return -ENOMEM; } dvb_ringbuffer_init(&dmxdev->dvr_buffer, mem, DVR_BUFFER_SIZE); if (dmxdev->may_do_mmap) dvb_vb2_init(&dmxdev->dvr_vb2_ctx, "dvr", file->f_flags & O_NONBLOCK); dvbdev->readers--; } if ((file->f_flags & O_ACCMODE) == O_WRONLY) { dmxdev->dvr_orig_fe = dmxdev->demux->frontend; if (!dmxdev->demux->write) { mutex_unlock(&dmxdev->mutex); return -EOPNOTSUPP; } front = get_fe(dmxdev->demux, DMX_MEMORY_FE); if (!front) { mutex_unlock(&dmxdev->mutex); return -EINVAL; } dmxdev->demux->disconnect_frontend(dmxdev->demux); dmxdev->demux->connect_frontend(dmxdev->demux, front); } dvbdev->users++; mutex_unlock(&dmxdev->mutex); return 0; } static int dvb_dvr_release(struct inode *inode, struct file *file) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; mutex_lock(&dmxdev->mutex); if ((file->f_flags & O_ACCMODE) == O_WRONLY) { dmxdev->demux->disconnect_frontend(dmxdev->demux); dmxdev->demux->connect_frontend(dmxdev->demux, dmxdev->dvr_orig_fe); } if (((file->f_flags & O_ACCMODE) == O_RDONLY) || dmxdev->may_do_mmap) { if (dmxdev->may_do_mmap) { if (dvb_vb2_is_streaming(&dmxdev->dvr_vb2_ctx)) dvb_vb2_stream_off(&dmxdev->dvr_vb2_ctx); dvb_vb2_release(&dmxdev->dvr_vb2_ctx); } dvbdev->readers++; if (dmxdev->dvr_buffer.data) { void *mem = dmxdev->dvr_buffer.data; /*memory barrier*/ mb(); spin_lock_irq(&dmxdev->lock); dmxdev->dvr_buffer.data = NULL; spin_unlock_irq(&dmxdev->lock); vfree(mem); } } /* TODO */ dvbdev->users--; if (dvbdev->users == 1 && dmxdev->exit == 1) { mutex_unlock(&dmxdev->mutex); wake_up(&dvbdev->wait_queue); } else mutex_unlock(&dmxdev->mutex); return 0; } static ssize_t dvb_dvr_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; int ret; if (!dmxdev->demux->write) return -EOPNOTSUPP; if ((file->f_flags & O_ACCMODE) != O_WRONLY) return -EINVAL; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; if (dmxdev->exit) { mutex_unlock(&dmxdev->mutex); return -ENODEV; } ret = dmxdev->demux->write(dmxdev->demux, buf, count); mutex_unlock(&dmxdev->mutex); return ret; } static ssize_t dvb_dvr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; if (dmxdev->exit) return -ENODEV; return dvb_dmxdev_buffer_read(&dmxdev->dvr_buffer, file->f_flags & O_NONBLOCK, buf, count, ppos); } static int dvb_dvr_set_buffer_size(struct dmxdev *dmxdev, unsigned long size) { struct dvb_ringbuffer *buf = &dmxdev->dvr_buffer; void *newmem; void *oldmem; dprintk("%s\n", __func__); if (buf->size == size) return 0; if (!size) return -EINVAL; newmem = vmalloc(size); if (!newmem) return -ENOMEM; oldmem = buf->data; spin_lock_irq(&dmxdev->lock); buf->data = newmem; buf->size = size; /* reset and not flush in case the buffer shrinks */ dvb_ringbuffer_reset(buf); spin_unlock_irq(&dmxdev->lock); vfree(oldmem); return 0; } static inline void dvb_dmxdev_filter_state_set(struct dmxdev_filter *dmxdevfilter, int state) { spin_lock_irq(&dmxdevfilter->dev->lock); dmxdevfilter->state = state; spin_unlock_irq(&dmxdevfilter->dev->lock); } static int dvb_dmxdev_set_buffer_size(struct dmxdev_filter *dmxdevfilter, unsigned long size) { struct dvb_ringbuffer *buf = &dmxdevfilter->buffer; void *newmem; void *oldmem; if (buf->size == size) return 0; if (!size) return -EINVAL; if (dmxdevfilter->state >= DMXDEV_STATE_GO) return -EBUSY; newmem = vmalloc(size); if (!newmem) return -ENOMEM; oldmem = buf->data; spin_lock_irq(&dmxdevfilter->dev->lock); buf->data = newmem; buf->size = size; /* reset and not flush in case the buffer shrinks */ dvb_ringbuffer_reset(buf); spin_unlock_irq(&dmxdevfilter->dev->lock); vfree(oldmem); return 0; } static void dvb_dmxdev_filter_timeout(struct timer_list *t) { struct dmxdev_filter *dmxdevfilter = from_timer(dmxdevfilter, t, timer); dmxdevfilter->buffer.error = -ETIMEDOUT; spin_lock_irq(&dmxdevfilter->dev->lock); dmxdevfilter->state = DMXDEV_STATE_TIMEDOUT; spin_unlock_irq(&dmxdevfilter->dev->lock); wake_up(&dmxdevfilter->buffer.queue); } static void dvb_dmxdev_filter_timer(struct dmxdev_filter *dmxdevfilter) { struct dmx_sct_filter_params *para = &dmxdevfilter->params.sec; del_timer(&dmxdevfilter->timer); if (para->timeout) { dmxdevfilter->timer.expires = jiffies + 1 + (HZ / 2 + HZ * para->timeout) / 1000; add_timer(&dmxdevfilter->timer); } } static int dvb_dmxdev_section_callback(const u8 *buffer1, size_t buffer1_len, const u8 *buffer2, size_t buffer2_len, struct dmx_section_filter *filter, u32 *buffer_flags) { struct dmxdev_filter *dmxdevfilter = filter->priv; int ret; if (!dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx) && dmxdevfilter->buffer.error) { wake_up(&dmxdevfilter->buffer.queue); return 0; } spin_lock(&dmxdevfilter->dev->lock); if (dmxdevfilter->state != DMXDEV_STATE_GO) { spin_unlock(&dmxdevfilter->dev->lock); return 0; } del_timer(&dmxdevfilter->timer); dprintk("section callback %*ph\n", 6, buffer1); if (dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx)) { ret = dvb_vb2_fill_buffer(&dmxdevfilter->vb2_ctx, buffer1, buffer1_len, buffer_flags); if (ret == buffer1_len) ret = dvb_vb2_fill_buffer(&dmxdevfilter->vb2_ctx, buffer2, buffer2_len, buffer_flags); } else { ret = dvb_dmxdev_buffer_write(&dmxdevfilter->buffer, buffer1, buffer1_len); if (ret == buffer1_len) { ret = dvb_dmxdev_buffer_write(&dmxdevfilter->buffer, buffer2, buffer2_len); } } if (ret < 0) dmxdevfilter->buffer.error = ret; if (dmxdevfilter->params.sec.flags & DMX_ONESHOT) dmxdevfilter->state = DMXDEV_STATE_DONE; spin_unlock(&dmxdevfilter->dev->lock); wake_up(&dmxdevfilter->buffer.queue); return 0; } static int dvb_dmxdev_ts_callback(const u8 *buffer1, size_t buffer1_len, const u8 *buffer2, size_t buffer2_len, struct dmx_ts_feed *feed, u32 *buffer_flags) { struct dmxdev_filter *dmxdevfilter = feed->priv; struct dvb_ringbuffer *buffer; #ifdef CONFIG_DVB_MMAP struct dvb_vb2_ctx *ctx; #endif int ret; spin_lock(&dmxdevfilter->dev->lock); if (dmxdevfilter->params.pes.output == DMX_OUT_DECODER) { spin_unlock(&dmxdevfilter->dev->lock); return 0; } if (dmxdevfilter->params.pes.output == DMX_OUT_TAP || dmxdevfilter->params.pes.output == DMX_OUT_TSDEMUX_TAP) { buffer = &dmxdevfilter->buffer; #ifdef CONFIG_DVB_MMAP ctx = &dmxdevfilter->vb2_ctx; #endif } else { buffer = &dmxdevfilter->dev->dvr_buffer; #ifdef CONFIG_DVB_MMAP ctx = &dmxdevfilter->dev->dvr_vb2_ctx; #endif } if (dvb_vb2_is_streaming(ctx)) { ret = dvb_vb2_fill_buffer(ctx, buffer1, buffer1_len, buffer_flags); if (ret == buffer1_len) ret = dvb_vb2_fill_buffer(ctx, buffer2, buffer2_len, buffer_flags); } else { if (buffer->error) { spin_unlock(&dmxdevfilter->dev->lock); wake_up(&buffer->queue); return 0; } ret = dvb_dmxdev_buffer_write(buffer, buffer1, buffer1_len); if (ret == buffer1_len) ret = dvb_dmxdev_buffer_write(buffer, buffer2, buffer2_len); } if (ret < 0) buffer->error = ret; spin_unlock(&dmxdevfilter->dev->lock); wake_up(&buffer->queue); return 0; } /* stop feed but only mark the specified filter as stopped (state set) */ static int dvb_dmxdev_feed_stop(struct dmxdev_filter *dmxdevfilter) { struct dmxdev_feed *feed; dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_SET); switch (dmxdevfilter->type) { case DMXDEV_TYPE_SEC: del_timer(&dmxdevfilter->timer); dmxdevfilter->feed.sec->stop_filtering(dmxdevfilter->feed.sec); break; case DMXDEV_TYPE_PES: list_for_each_entry(feed, &dmxdevfilter->feed.ts, next) feed->ts->stop_filtering(feed->ts); break; default: return -EINVAL; } return 0; } /* start feed associated with the specified filter */ static int dvb_dmxdev_feed_start(struct dmxdev_filter *filter) { struct dmxdev_feed *feed; int ret; dvb_dmxdev_filter_state_set(filter, DMXDEV_STATE_GO); switch (filter->type) { case DMXDEV_TYPE_SEC: return filter->feed.sec->start_filtering(filter->feed.sec); case DMXDEV_TYPE_PES: list_for_each_entry(feed, &filter->feed.ts, next) { ret = feed->ts->start_filtering(feed->ts); if (ret < 0) { dvb_dmxdev_feed_stop(filter); return ret; } } break; default: return -EINVAL; } return 0; } /* restart section feed if it has filters left associated with it, otherwise release the feed */ static int dvb_dmxdev_feed_restart(struct dmxdev_filter *filter) { int i; struct dmxdev *dmxdev = filter->dev; u16 pid = filter->params.sec.pid; for (i = 0; i < dmxdev->filternum; i++) if (dmxdev->filter[i].state >= DMXDEV_STATE_GO && dmxdev->filter[i].type == DMXDEV_TYPE_SEC && dmxdev->filter[i].params.sec.pid == pid) { dvb_dmxdev_feed_start(&dmxdev->filter[i]); return 0; } filter->dev->demux->release_section_feed(dmxdev->demux, filter->feed.sec); return 0; } static int dvb_dmxdev_filter_stop(struct dmxdev_filter *dmxdevfilter) { struct dmxdev_feed *feed; struct dmx_demux *demux; if (dmxdevfilter->state < DMXDEV_STATE_GO) return 0; switch (dmxdevfilter->type) { case DMXDEV_TYPE_SEC: if (!dmxdevfilter->feed.sec) break; dvb_dmxdev_feed_stop(dmxdevfilter); if (dmxdevfilter->filter.sec) dmxdevfilter->feed.sec-> release_filter(dmxdevfilter->feed.sec, dmxdevfilter->filter.sec); dvb_dmxdev_feed_restart(dmxdevfilter); dmxdevfilter->feed.sec = NULL; break; case DMXDEV_TYPE_PES: dvb_dmxdev_feed_stop(dmxdevfilter); demux = dmxdevfilter->dev->demux; list_for_each_entry(feed, &dmxdevfilter->feed.ts, next) { demux->release_ts_feed(demux, feed->ts); feed->ts = NULL; } break; default: if (dmxdevfilter->state == DMXDEV_STATE_ALLOCATED) return 0; return -EINVAL; } dvb_ringbuffer_flush(&dmxdevfilter->buffer); return 0; } static void dvb_dmxdev_delete_pids(struct dmxdev_filter *dmxdevfilter) { struct dmxdev_feed *feed, *tmp; /* delete all PIDs */ list_for_each_entry_safe(feed, tmp, &dmxdevfilter->feed.ts, next) { list_del(&feed->next); kfree(feed); } BUG_ON(!list_empty(&dmxdevfilter->feed.ts)); } static inline int dvb_dmxdev_filter_reset(struct dmxdev_filter *dmxdevfilter) { if (dmxdevfilter->state < DMXDEV_STATE_SET) return 0; if (dmxdevfilter->type == DMXDEV_TYPE_PES) dvb_dmxdev_delete_pids(dmxdevfilter); dmxdevfilter->type = DMXDEV_TYPE_NONE; dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_ALLOCATED); return 0; } static int dvb_dmxdev_start_feed(struct dmxdev *dmxdev, struct dmxdev_filter *filter, struct dmxdev_feed *feed) { ktime_t timeout = ktime_set(0, 0); struct dmx_pes_filter_params *para = &filter->params.pes; enum dmx_output otype; int ret; int ts_type; enum dmx_ts_pes ts_pes; struct dmx_ts_feed *tsfeed; feed->ts = NULL; otype = para->output; ts_pes = para->pes_type; if (ts_pes < DMX_PES_OTHER) ts_type = TS_DECODER; else ts_type = 0; if (otype == DMX_OUT_TS_TAP) ts_type |= TS_PACKET; else if (otype == DMX_OUT_TSDEMUX_TAP) ts_type |= TS_PACKET | TS_DEMUX; else if (otype == DMX_OUT_TAP) ts_type |= TS_PACKET | TS_DEMUX | TS_PAYLOAD_ONLY; ret = dmxdev->demux->allocate_ts_feed(dmxdev->demux, &feed->ts, dvb_dmxdev_ts_callback); if (ret < 0) return ret; tsfeed = feed->ts; tsfeed->priv = filter; ret = tsfeed->set(tsfeed, feed->pid, ts_type, ts_pes, timeout); if (ret < 0) { dmxdev->demux->release_ts_feed(dmxdev->demux, tsfeed); return ret; } ret = tsfeed->start_filtering(tsfeed); if (ret < 0) { dmxdev->demux->release_ts_feed(dmxdev->demux, tsfeed); return ret; } return 0; } static int dvb_dmxdev_filter_start(struct dmxdev_filter *filter) { struct dmxdev *dmxdev = filter->dev; struct dmxdev_feed *feed; void *mem; int ret, i; if (filter->state < DMXDEV_STATE_SET) return -EINVAL; if (filter->state >= DMXDEV_STATE_GO) dvb_dmxdev_filter_stop(filter); if (!filter->buffer.data) { mem = vmalloc(filter->buffer.size); if (!mem) return -ENOMEM; spin_lock_irq(&filter->dev->lock); filter->buffer.data = mem; spin_unlock_irq(&filter->dev->lock); } dvb_ringbuffer_flush(&filter->buffer); switch (filter->type) { case DMXDEV_TYPE_SEC: { struct dmx_sct_filter_params *para = &filter->params.sec; struct dmx_section_filter **secfilter = &filter->filter.sec; struct dmx_section_feed **secfeed = &filter->feed.sec; *secfilter = NULL; *secfeed = NULL; /* find active filter/feed with same PID */ for (i = 0; i < dmxdev->filternum; i++) { if (dmxdev->filter[i].state >= DMXDEV_STATE_GO && dmxdev->filter[i].type == DMXDEV_TYPE_SEC && dmxdev->filter[i].params.sec.pid == para->pid) { *secfeed = dmxdev->filter[i].feed.sec; break; } } /* if no feed found, try to allocate new one */ if (!*secfeed) { ret = dmxdev->demux->allocate_section_feed(dmxdev->demux, secfeed, dvb_dmxdev_section_callback); if (!*secfeed) { pr_err("DVB (%s): could not alloc feed\n", __func__); return ret; } ret = (*secfeed)->set(*secfeed, para->pid, (para->flags & DMX_CHECK_CRC) ? 1 : 0); if (ret < 0) { pr_err("DVB (%s): could not set feed\n", __func__); dvb_dmxdev_feed_restart(filter); return ret; } } else { dvb_dmxdev_feed_stop(filter); } ret = (*secfeed)->allocate_filter(*secfeed, secfilter); if (ret < 0) { dvb_dmxdev_feed_restart(filter); filter->feed.sec->start_filtering(*secfeed); dprintk("could not get filter\n"); return ret; } (*secfilter)->priv = filter; memcpy(&((*secfilter)->filter_value[3]), &(para->filter.filter[1]), DMX_FILTER_SIZE - 1); memcpy(&(*secfilter)->filter_mask[3], ¶->filter.mask[1], DMX_FILTER_SIZE - 1); memcpy(&(*secfilter)->filter_mode[3], ¶->filter.mode[1], DMX_FILTER_SIZE - 1); (*secfilter)->filter_value[0] = para->filter.filter[0]; (*secfilter)->filter_mask[0] = para->filter.mask[0]; (*secfilter)->filter_mode[0] = para->filter.mode[0]; (*secfilter)->filter_mask[1] = 0; (*secfilter)->filter_mask[2] = 0; filter->todo = 0; ret = filter->feed.sec->start_filtering(filter->feed.sec); if (ret < 0) return ret; dvb_dmxdev_filter_timer(filter); break; } case DMXDEV_TYPE_PES: list_for_each_entry(feed, &filter->feed.ts, next) { ret = dvb_dmxdev_start_feed(dmxdev, filter, feed); if (ret < 0) { dvb_dmxdev_filter_stop(filter); return ret; } } break; default: return -EINVAL; } dvb_dmxdev_filter_state_set(filter, DMXDEV_STATE_GO); return 0; } static int dvb_demux_open(struct inode *inode, struct file *file) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; int i; struct dmxdev_filter *dmxdevfilter; if (!dmxdev->filter) return -EINVAL; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; if (dmxdev->exit) { mutex_unlock(&dmxdev->mutex); return -ENODEV; } for (i = 0; i < dmxdev->filternum; i++) if (dmxdev->filter[i].state == DMXDEV_STATE_FREE) break; if (i == dmxdev->filternum) { mutex_unlock(&dmxdev->mutex); return -EMFILE; } dmxdevfilter = &dmxdev->filter[i]; mutex_init(&dmxdevfilter->mutex); file->private_data = dmxdevfilter; #ifdef CONFIG_DVB_MMAP dmxdev->may_do_mmap = 1; #else dmxdev->may_do_mmap = 0; #endif dvb_ringbuffer_init(&dmxdevfilter->buffer, NULL, 8192); dvb_vb2_init(&dmxdevfilter->vb2_ctx, "demux_filter", file->f_flags & O_NONBLOCK); dmxdevfilter->type = DMXDEV_TYPE_NONE; dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_ALLOCATED); timer_setup(&dmxdevfilter->timer, dvb_dmxdev_filter_timeout, 0); dvbdev->users++; mutex_unlock(&dmxdev->mutex); return 0; } static int dvb_dmxdev_filter_free(struct dmxdev *dmxdev, struct dmxdev_filter *dmxdevfilter) { mutex_lock(&dmxdev->mutex); mutex_lock(&dmxdevfilter->mutex); if (dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx)) dvb_vb2_stream_off(&dmxdevfilter->vb2_ctx); dvb_vb2_release(&dmxdevfilter->vb2_ctx); dvb_dmxdev_filter_stop(dmxdevfilter); dvb_dmxdev_filter_reset(dmxdevfilter); if (dmxdevfilter->buffer.data) { void *mem = dmxdevfilter->buffer.data; spin_lock_irq(&dmxdev->lock); dmxdevfilter->buffer.data = NULL; spin_unlock_irq(&dmxdev->lock); vfree(mem); } dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_FREE); wake_up(&dmxdevfilter->buffer.queue); mutex_unlock(&dmxdevfilter->mutex); mutex_unlock(&dmxdev->mutex); return 0; } static inline void invert_mode(struct dmx_filter *filter) { int i; for (i = 0; i < DMX_FILTER_SIZE; i++) filter->mode[i] ^= 0xff; } static int dvb_dmxdev_add_pid(struct dmxdev *dmxdev, struct dmxdev_filter *filter, u16 pid) { struct dmxdev_feed *feed; if ((filter->type != DMXDEV_TYPE_PES) || (filter->state < DMXDEV_STATE_SET)) return -EINVAL; /* only TS packet filters may have multiple PIDs */ if ((filter->params.pes.output != DMX_OUT_TSDEMUX_TAP) && (!list_empty(&filter->feed.ts))) return -EINVAL; feed = kzalloc(sizeof(struct dmxdev_feed), GFP_KERNEL); if (feed == NULL) return -ENOMEM; feed->pid = pid; list_add(&feed->next, &filter->feed.ts); if (filter->state >= DMXDEV_STATE_GO) return dvb_dmxdev_start_feed(dmxdev, filter, feed); return 0; } static int dvb_dmxdev_remove_pid(struct dmxdev *dmxdev, struct dmxdev_filter *filter, u16 pid) { struct dmxdev_feed *feed, *tmp; if ((filter->type != DMXDEV_TYPE_PES) || (filter->state < DMXDEV_STATE_SET)) return -EINVAL; list_for_each_entry_safe(feed, tmp, &filter->feed.ts, next) { if ((feed->pid == pid) && (feed->ts != NULL)) { feed->ts->stop_filtering(feed->ts); filter->dev->demux->release_ts_feed(filter->dev->demux, feed->ts); list_del(&feed->next); kfree(feed); } } return 0; } static int dvb_dmxdev_filter_set(struct dmxdev *dmxdev, struct dmxdev_filter *dmxdevfilter, struct dmx_sct_filter_params *params) { dprintk("%s: PID=0x%04x, flags=%02x, timeout=%d\n", __func__, params->pid, params->flags, params->timeout); dvb_dmxdev_filter_stop(dmxdevfilter); dmxdevfilter->type = DMXDEV_TYPE_SEC; memcpy(&dmxdevfilter->params.sec, params, sizeof(struct dmx_sct_filter_params)); invert_mode(&dmxdevfilter->params.sec.filter); dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_SET); if (params->flags & DMX_IMMEDIATE_START) return dvb_dmxdev_filter_start(dmxdevfilter); return 0; } static int dvb_dmxdev_pes_filter_set(struct dmxdev *dmxdev, struct dmxdev_filter *dmxdevfilter, struct dmx_pes_filter_params *params) { int ret; dvb_dmxdev_filter_stop(dmxdevfilter); dvb_dmxdev_filter_reset(dmxdevfilter); if ((unsigned int)params->pes_type > DMX_PES_OTHER) return -EINVAL; dmxdevfilter->type = DMXDEV_TYPE_PES; memcpy(&dmxdevfilter->params, params, sizeof(struct dmx_pes_filter_params)); INIT_LIST_HEAD(&dmxdevfilter->feed.ts); dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_SET); ret = dvb_dmxdev_add_pid(dmxdev, dmxdevfilter, dmxdevfilter->params.pes.pid); if (ret < 0) return ret; if (params->flags & DMX_IMMEDIATE_START) return dvb_dmxdev_filter_start(dmxdevfilter); return 0; } static ssize_t dvb_dmxdev_read_sec(struct dmxdev_filter *dfil, struct file *file, char __user *buf, size_t count, loff_t *ppos) { int result, hcount; int done = 0; if (dfil->todo <= 0) { hcount = 3 + dfil->todo; if (hcount > count) hcount = count; result = dvb_dmxdev_buffer_read(&dfil->buffer, file->f_flags & O_NONBLOCK, buf, hcount, ppos); if (result < 0) { dfil->todo = 0; return result; } if (copy_from_user(dfil->secheader - dfil->todo, buf, result)) return -EFAULT; buf += result; done = result; count -= result; dfil->todo -= result; if (dfil->todo > -3) return done; dfil->todo = ((dfil->secheader[1] << 8) | dfil->secheader[2]) & 0xfff; if (!count) return done; } if (count > dfil->todo) count = dfil->todo; result = dvb_dmxdev_buffer_read(&dfil->buffer, file->f_flags & O_NONBLOCK, buf, count, ppos); if (result < 0) return result; dfil->todo -= result; return (result + done); } static ssize_t dvb_demux_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct dmxdev_filter *dmxdevfilter = file->private_data; int ret; if (mutex_lock_interruptible(&dmxdevfilter->mutex)) return -ERESTARTSYS; if (dmxdevfilter->type == DMXDEV_TYPE_SEC) ret = dvb_dmxdev_read_sec(dmxdevfilter, file, buf, count, ppos); else ret = dvb_dmxdev_buffer_read(&dmxdevfilter->buffer, file->f_flags & O_NONBLOCK, buf, count, ppos); mutex_unlock(&dmxdevfilter->mutex); return ret; } static int dvb_demux_do_ioctl(struct file *file, unsigned int cmd, void *parg) { struct dmxdev_filter *dmxdevfilter = file->private_data; struct dmxdev *dmxdev = dmxdevfilter->dev; unsigned long arg = (unsigned long)parg; int ret = 0; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; switch (cmd) { case DMX_START: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } if (dmxdevfilter->state < DMXDEV_STATE_SET) ret = -EINVAL; else ret = dvb_dmxdev_filter_start(dmxdevfilter); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_STOP: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_dmxdev_filter_stop(dmxdevfilter); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_SET_FILTER: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_dmxdev_filter_set(dmxdev, dmxdevfilter, parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_SET_PES_FILTER: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_dmxdev_pes_filter_set(dmxdev, dmxdevfilter, parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_SET_BUFFER_SIZE: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_dmxdev_set_buffer_size(dmxdevfilter, arg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_GET_PES_PIDS: if (!dmxdev->demux->get_pes_pids) { ret = -EINVAL; break; } dmxdev->demux->get_pes_pids(dmxdev->demux, parg); break; case DMX_GET_STC: if (!dmxdev->demux->get_stc) { ret = -EINVAL; break; } ret = dmxdev->demux->get_stc(dmxdev->demux, ((struct dmx_stc *)parg)->num, &((struct dmx_stc *)parg)->stc, &((struct dmx_stc *)parg)->base); break; case DMX_ADD_PID: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { ret = -ERESTARTSYS; break; } ret = dvb_dmxdev_add_pid(dmxdev, dmxdevfilter, *(u16 *)parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_REMOVE_PID: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { ret = -ERESTARTSYS; break; } ret = dvb_dmxdev_remove_pid(dmxdev, dmxdevfilter, *(u16 *)parg); mutex_unlock(&dmxdevfilter->mutex); break; #ifdef CONFIG_DVB_MMAP case DMX_REQBUFS: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_reqbufs(&dmxdevfilter->vb2_ctx, parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_QUERYBUF: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_querybuf(&dmxdevfilter->vb2_ctx, parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_EXPBUF: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_expbuf(&dmxdevfilter->vb2_ctx, parg); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_QBUF: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_qbuf(&dmxdevfilter->vb2_ctx, parg); if (ret == 0 && !dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx)) ret = dvb_vb2_stream_on(&dmxdevfilter->vb2_ctx); mutex_unlock(&dmxdevfilter->mutex); break; case DMX_DQBUF: if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_dqbuf(&dmxdevfilter->vb2_ctx, parg); mutex_unlock(&dmxdevfilter->mutex); break; #endif default: ret = -ENOTTY; break; } mutex_unlock(&dmxdev->mutex); return ret; } static long dvb_demux_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return dvb_usercopy(file, cmd, arg, dvb_demux_do_ioctl); } static __poll_t dvb_demux_poll(struct file *file, poll_table *wait) { struct dmxdev_filter *dmxdevfilter = file->private_data; __poll_t mask = 0; poll_wait(file, &dmxdevfilter->buffer.queue, wait); if ((!dmxdevfilter) || dmxdevfilter->dev->exit) return EPOLLERR; if (dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx)) return dvb_vb2_poll(&dmxdevfilter->vb2_ctx, file, wait); if (dmxdevfilter->state != DMXDEV_STATE_GO && dmxdevfilter->state != DMXDEV_STATE_DONE && dmxdevfilter->state != DMXDEV_STATE_TIMEDOUT) return 0; if (dmxdevfilter->buffer.error) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI | EPOLLERR); if (!dvb_ringbuffer_empty(&dmxdevfilter->buffer)) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI); return mask; } #ifdef CONFIG_DVB_MMAP static int dvb_demux_mmap(struct file *file, struct vm_area_struct *vma) { struct dmxdev_filter *dmxdevfilter = file->private_data; struct dmxdev *dmxdev = dmxdevfilter->dev; int ret; if (!dmxdev->may_do_mmap) return -ENOTTY; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; if (mutex_lock_interruptible(&dmxdevfilter->mutex)) { mutex_unlock(&dmxdev->mutex); return -ERESTARTSYS; } ret = dvb_vb2_mmap(&dmxdevfilter->vb2_ctx, vma); mutex_unlock(&dmxdevfilter->mutex); mutex_unlock(&dmxdev->mutex); return ret; } #endif static int dvb_demux_release(struct inode *inode, struct file *file) { struct dmxdev_filter *dmxdevfilter = file->private_data; struct dmxdev *dmxdev = dmxdevfilter->dev; int ret; ret = dvb_dmxdev_filter_free(dmxdev, dmxdevfilter); mutex_lock(&dmxdev->mutex); dmxdev->dvbdev->users--; if (dmxdev->dvbdev->users == 1 && dmxdev->exit == 1) { mutex_unlock(&dmxdev->mutex); wake_up(&dmxdev->dvbdev->wait_queue); } else mutex_unlock(&dmxdev->mutex); return ret; } static const struct file_operations dvb_demux_fops = { .owner = THIS_MODULE, .read = dvb_demux_read, .unlocked_ioctl = dvb_demux_ioctl, .compat_ioctl = dvb_demux_ioctl, .open = dvb_demux_open, .release = dvb_demux_release, .poll = dvb_demux_poll, .llseek = default_llseek, #ifdef CONFIG_DVB_MMAP .mmap = dvb_demux_mmap, #endif }; static const struct dvb_device dvbdev_demux = { .priv = NULL, .users = 1, .writers = 1, #if defined(CONFIG_MEDIA_CONTROLLER_DVB) .name = "dvb-demux", #endif .fops = &dvb_demux_fops }; static int dvb_dvr_do_ioctl(struct file *file, unsigned int cmd, void *parg) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; unsigned long arg = (unsigned long)parg; int ret; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; switch (cmd) { case DMX_SET_BUFFER_SIZE: ret = dvb_dvr_set_buffer_size(dmxdev, arg); break; #ifdef CONFIG_DVB_MMAP case DMX_REQBUFS: ret = dvb_vb2_reqbufs(&dmxdev->dvr_vb2_ctx, parg); break; case DMX_QUERYBUF: ret = dvb_vb2_querybuf(&dmxdev->dvr_vb2_ctx, parg); break; case DMX_EXPBUF: ret = dvb_vb2_expbuf(&dmxdev->dvr_vb2_ctx, parg); break; case DMX_QBUF: ret = dvb_vb2_qbuf(&dmxdev->dvr_vb2_ctx, parg); if (ret == 0 && !dvb_vb2_is_streaming(&dmxdev->dvr_vb2_ctx)) ret = dvb_vb2_stream_on(&dmxdev->dvr_vb2_ctx); break; case DMX_DQBUF: ret = dvb_vb2_dqbuf(&dmxdev->dvr_vb2_ctx, parg); break; #endif default: ret = -ENOTTY; break; } mutex_unlock(&dmxdev->mutex); return ret; } static long dvb_dvr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return dvb_usercopy(file, cmd, arg, dvb_dvr_do_ioctl); } static __poll_t dvb_dvr_poll(struct file *file, poll_table *wait) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; __poll_t mask = 0; dprintk("%s\n", __func__); poll_wait(file, &dmxdev->dvr_buffer.queue, wait); if (dmxdev->exit) return EPOLLERR; if (dvb_vb2_is_streaming(&dmxdev->dvr_vb2_ctx)) return dvb_vb2_poll(&dmxdev->dvr_vb2_ctx, file, wait); if (((file->f_flags & O_ACCMODE) == O_RDONLY) || dmxdev->may_do_mmap) { if (dmxdev->dvr_buffer.error) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI | EPOLLERR); if (!dvb_ringbuffer_empty(&dmxdev->dvr_buffer)) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI); } else mask |= (EPOLLOUT | EPOLLWRNORM | EPOLLPRI); return mask; } #ifdef CONFIG_DVB_MMAP static int dvb_dvr_mmap(struct file *file, struct vm_area_struct *vma) { struct dvb_device *dvbdev = file->private_data; struct dmxdev *dmxdev = dvbdev->priv; int ret; if (!dmxdev->may_do_mmap) return -ENOTTY; if (dmxdev->exit) return -ENODEV; if (mutex_lock_interruptible(&dmxdev->mutex)) return -ERESTARTSYS; ret = dvb_vb2_mmap(&dmxdev->dvr_vb2_ctx, vma); mutex_unlock(&dmxdev->mutex); return ret; } #endif static const struct file_operations dvb_dvr_fops = { .owner = THIS_MODULE, .read = dvb_dvr_read, .write = dvb_dvr_write, .unlocked_ioctl = dvb_dvr_ioctl, .open = dvb_dvr_open, .release = dvb_dvr_release, .poll = dvb_dvr_poll, .llseek = default_llseek, #ifdef CONFIG_DVB_MMAP .mmap = dvb_dvr_mmap, #endif }; static const struct dvb_device dvbdev_dvr = { .priv = NULL, .readers = 1, .users = 1, #if defined(CONFIG_MEDIA_CONTROLLER_DVB) .name = "dvb-dvr", #endif .fops = &dvb_dvr_fops }; int dvb_dmxdev_init(struct dmxdev *dmxdev, struct dvb_adapter *dvb_adapter) { int i, ret; if (dmxdev->demux->open(dmxdev->demux) < 0) return -EUSERS; dmxdev->filter = vmalloc(array_size(sizeof(struct dmxdev_filter), dmxdev->filternum)); if (!dmxdev->filter) return -ENOMEM; mutex_init(&dmxdev->mutex); spin_lock_init(&dmxdev->lock); for (i = 0; i < dmxdev->filternum; i++) { dmxdev->filter[i].dev = dmxdev; dmxdev->filter[i].buffer.data = NULL; dvb_dmxdev_filter_state_set(&dmxdev->filter[i], DMXDEV_STATE_FREE); } ret = dvb_register_device(dvb_adapter, &dmxdev->dvbdev, &dvbdev_demux, dmxdev, DVB_DEVICE_DEMUX, dmxdev->filternum); if (ret < 0) goto err_register_dvbdev; ret = dvb_register_device(dvb_adapter, &dmxdev->dvr_dvbdev, &dvbdev_dvr, dmxdev, DVB_DEVICE_DVR, dmxdev->filternum); if (ret < 0) goto err_register_dvr_dvbdev; dvb_ringbuffer_init(&dmxdev->dvr_buffer, NULL, 8192); return 0; err_register_dvr_dvbdev: dvb_unregister_device(dmxdev->dvbdev); err_register_dvbdev: vfree(dmxdev->filter); dmxdev->filter = NULL; return ret; } EXPORT_SYMBOL(dvb_dmxdev_init); void dvb_dmxdev_release(struct dmxdev *dmxdev) { mutex_lock(&dmxdev->mutex); dmxdev->exit = 1; mutex_unlock(&dmxdev->mutex); if (dmxdev->dvbdev->users > 1) { wait_event(dmxdev->dvbdev->wait_queue, dmxdev->dvbdev->users == 1); } if (dmxdev->dvr_dvbdev->users > 1) { wait_event(dmxdev->dvr_dvbdev->wait_queue, dmxdev->dvr_dvbdev->users == 1); } dvb_unregister_device(dmxdev->dvbdev); dvb_unregister_device(dmxdev->dvr_dvbdev); vfree(dmxdev->filter); dmxdev->filter = NULL; dmxdev->demux->close(dmxdev->demux); } EXPORT_SYMBOL(dvb_dmxdev_release); |
4 6 17 18 18 18 18 35 24 23 24 23 24 30 10 4 10 21 21 21 21 21 21 21 18 27 18 18 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 | /* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/segment.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/blkdev.h> #include <linux/backing-dev.h> /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) #define NULL_SECNO ((unsigned int)(~0)) #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ #define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) #define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) #define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA)) static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, unsigned short seg_type) { f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); } #define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) #define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) #define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) #define IS_CURSEG(sbi, seg) \ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno)) #define IS_CURSEC(sbi, secno) \ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ SEGS_PER_SEC(sbi))) #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) #define SEG0_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) #define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) \ (SM_I(sbi) ? SM_I(sbi)->segment_count : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) #define TOTAL_BLKS(sbi) (SEGS_TO_BLKS(sbi, TOTAL_SEGS(sbi))) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) #define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ (SEGS_TO_BLKS(sbi, GET_R2L_SEGNO(FREE_I(sbi), segno)))) #define NEXT_FREE_BLKADDR(sbi, curseg) \ (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (BLKS_TO_SEGS(sbi, GET_SEGOFF_FROM_SEG0(sbi, blk_addr))) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (BLKS_PER_SEG(sbi) - 1)) #define GET_SEGNO(sbi, blk_addr) \ ((!__is_valid_data_blkaddr(blk_addr)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define CAP_BLKS_PER_SEC(sbi) \ (BLKS_PER_SEC(sbi) - (sbi)->unusable_blocks_per_sec) #define CAP_SEGS_PER_SEC(sbi) \ (SEGS_PER_SEC(sbi) - \ BLKS_TO_SEGS(sbi, (sbi)->unusable_blocks_per_sec)) #define GET_SEC_FROM_SEG(sbi, segno) \ (((segno) == -1) ? -1 : (segno) / SEGS_PER_SEC(sbi)) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * SEGS_PER_SEC(sbi)) #define GET_ZONE_FROM_SEC(sbi, secno) \ (((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) #define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ DIV_ROUND_UP(MAIN_SEGS(sbi), SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * In the victim_sel_policy->alloc_mode, there are three block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into * fragmented segment which has similar aging degree. */ enum { LFS = 0, SSR, AT_SSR, }; /* * In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. * GC_AT is based on age-threshold algorithm. */ enum { GC_CB = 0, GC_GREEDY, GC_AT, ALLOC_NEXT, FLUSH_DEVICE, MAX_GC_POLICY, }; /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. */ enum { BG_GC = 0, FG_GC, }; /* for a function parameter to select a victim segment */ struct victim_sel_policy { int alloc_mode; /* LFS or SSR */ int gc_mode; /* GC_CB or GC_GREEDY */ unsigned long *dirty_bitmap; /* dirty segment/section bitmap */ unsigned int max_search; /* * maximum # of segments/sections * to search */ unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ unsigned long long oldest_age; /* oldest age of segments having the same min cost */ unsigned int min_segno; /* segment # having min. cost */ unsigned long long age; /* mtime of GCed section*/ unsigned long long age_threshold;/* age threshold */ }; struct seg_entry { unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ unsigned int valid_blocks:10; /* # of valid blocks */ unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ #ifdef CONFIG_F2FS_CHECK_FS unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ #endif /* * # of valid blocks and the validity bitmap stored in the last * checkpoint pack. This information is used by the SSR mode. */ unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; unsigned long long mtime; /* modification time of the segment */ }; struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ }; #define MAX_SKIP_GC_COUNT 16 struct revoke_entry { struct list_head list; block_t old_addr; /* for revoking when fail to commit */ pgoff_t index; }; struct sit_info { block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ char *bitmap; /* all bitmaps pointer */ char *sit_bitmap; /* SIT bitmap pointer */ #ifdef CONFIG_F2FS_CHECK_FS char *sit_bitmap_mir; /* SIT bitmap mirror */ /* bitmap of segments to be ignored by GC in case of errors */ unsigned long *invalid_segmap; #endif unsigned int bitmap_size; /* SIT bitmap size */ unsigned long *tmp_map; /* bitmap for temporal use */ unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ unsigned int dirty_sentries; /* # of dirty sentries */ unsigned int sents_per_block; /* # of SIT entries per block */ struct rw_semaphore sentry_lock; /* to protect SIT cache */ struct seg_entry *sentries; /* SIT segment-level cache */ struct sec_entry *sec_entries; /* SIT section-level cache */ /* for cost-benefit algorithm in cleaning procedure */ unsigned long long elapsed_time; /* elapsed time after mount */ unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ unsigned long long dirty_min_mtime; /* rerange candidates in GC_AT */ unsigned long long dirty_max_mtime; /* rerange candidates in GC_AT */ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { unsigned int start_segno; /* start segment number logically */ unsigned int free_segments; /* # of free segments */ unsigned int free_sections; /* # of free sections */ spinlock_t segmap_lock; /* free segmap lock */ unsigned long *free_segmap; /* free segment bitmap */ unsigned long *free_secmap; /* free section bitmap */ }; /* Notice: The order of dirty type is same with CURSEG_XXX in f2fs.h */ enum dirty_type { DIRTY_HOT_DATA, /* dirty segments assigned as hot data logs */ DIRTY_WARM_DATA, /* dirty segments assigned as warm data logs */ DIRTY_COLD_DATA, /* dirty segments assigned as cold data logs */ DIRTY_HOT_NODE, /* dirty segments assigned as hot node logs */ DIRTY_WARM_NODE, /* dirty segments assigned as warm node logs */ DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */ DIRTY, /* to count # of dirty segments */ PRE, /* to count # of entirely obsolete segments */ NR_DIRTY_TYPE }; struct dirty_seglist_info { unsigned long *dirty_segmap[NR_DIRTY_TYPE]; unsigned long *dirty_secmap; struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ unsigned long *pinned_secmap; /* pinned victims from foreground GC */ unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ bool enable_pin_section; /* enable pinning section */ }; /* for active log information */ struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ struct f2fs_summary_block *sum_blk; /* cached summary block */ struct rw_semaphore journal_rwsem; /* protect journal area */ struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */ bool inited; /* indicate inmem log is inited */ }; struct sit_entry_set { struct list_head set_list; /* link with all sit sets */ unsigned int start_segno; /* start segno of sits in set */ unsigned int entry_cnt; /* the # of sit entries in set */ }; /* * inline functions */ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) { return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); return &sit_i->sentries[segno]; } static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ if (use_section && __is_large_section(sbi)) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; } static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, unsigned int segno, bool use_section) { if (use_section && __is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int blocks = 0; int i; for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) { struct seg_entry *se = get_seg_entry(sbi, start_segno); blocks += se->ckpt_valid_blocks; } return blocks; } return get_seg_entry(sbi, segno)->ckpt_valid_blocks; } static inline void seg_info_from_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { se->valid_blocks = GET_SIT_VBLOCKS(rs); se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); #ifdef CONFIG_F2FS_CHECK_FS memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); #endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } static inline void __seg_info_to_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | se->valid_blocks; rs->vblocks = cpu_to_le16(raw_vblocks); memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); rs->mtime = cpu_to_le64(se->mtime); } static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, struct page *page, unsigned int start) { struct f2fs_sit_block *raw_sit; struct seg_entry *se; struct f2fs_sit_entry *rs; unsigned int end = min(start + SIT_ENTRY_PER_BLOCK, (unsigned long)MAIN_SEGS(sbi)); int i; raw_sit = (struct f2fs_sit_block *)page_address(page); memset(raw_sit, 0, PAGE_SIZE); for (i = 0; i < end - start; i++) { rs = &raw_sit->entries[i]; se = get_seg_entry(sbi, start + i); __seg_info_to_raw_sit(se, rs); } } static inline void seg_info_to_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { __seg_info_to_raw_sit(se, rs); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); se->ckpt_valid_blocks = se->valid_blocks; } static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, unsigned int max, unsigned int segno) { unsigned int ret; spin_lock(&free_i->segmap_lock); ret = find_next_bit(free_i->free_segmap, max, segno); spin_unlock(&free_i->segmap_lock); return ret; } static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); clear_bit(segno, free_i->free_segmap); free_i->free_segments++; next = find_next_bit(free_i->free_segmap, start_segno + SEGS_PER_SEC(sbi), start_segno); if (next >= start_segno + usable_segs) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; } spin_unlock(&free_i->segmap_lock); } static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) free_i->free_sections--; } static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno, bool inmem) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; if (!inmem && IS_CURSEC(sbi, secno)) goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + SEGS_PER_SEC(sbi), start_segno); if (next >= start_segno + usable_segs) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; } } skip_free: spin_unlock(&free_i->segmap_lock); } static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) free_i->free_sections--; } spin_unlock(&free_i->segmap_lock); } static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); #ifdef CONFIG_F2FS_CHECK_FS if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, sit_i->bitmap_size)) f2fs_bug_on(sbi, 1); #endif memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); } static inline block_t written_block_count(struct f2fs_sb_info *sbi) { return SIT_I(sbi)->written_valid_blocks; } static inline unsigned int free_segments(struct f2fs_sb_info *sbi) { return FREE_I(sbi)->free_segments; } static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { return SM_I(sbi)->reserved_segments + SM_I(sbi)->additional_reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) { return FREE_I(sbi)->free_sections; } static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) { return DIRTY_I(sbi)->nr_dirty[PRE]; } static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi) { return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] + DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE]; } static inline int overprovision_segments(struct f2fs_sb_info *sbi) { return SM_I(sbi)->ovp_segments; } static inline int reserved_sections(struct f2fs_sb_info *sbi) { return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int dent_blocks) { unsigned segno, left_blocks; int i; /* check current node sections in the worst case. */ for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { segno = CURSEG_I(sbi, i)->segno; left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); if (node_blocks > left_blocks) return false; } /* check current data section for dentry blocks. */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); if (dent_blocks > left_blocks) return false; return true; } /* * calculate needed sections for dirty node/dentry * and call has_curseg_enough_space */ static inline void __get_secs_required(struct f2fs_sb_info *sbi, unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) { unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); if (lower_p) *lower_p = node_secs + dent_secs; if (upper_p) *upper_p = node_secs + dent_secs + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); if (curseg_p) *curseg_p = has_curseg_enough_space(sbi, node_blocks, dent_blocks); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { unsigned int free_secs, lower_secs, upper_secs; bool curseg_space; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; __get_secs_required(sbi, &lower_secs, &upper_secs, &curseg_space); free_secs = free_sections(sbi) + freed; lower_secs += needed + reserved_sections(sbi); upper_secs += needed + reserved_sections(sbi); if (free_secs > upper_secs) return false; if (free_secs <= lower_secs) return true; return !curseg_space; } static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { return !has_not_enough_free_secs(sbi, freed, needed); } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) { if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; if (likely(has_enough_free_secs(sbi, 0, 0))) return true; return false; } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) { return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; } static inline int utilization(struct f2fs_sb_info *sbi) { return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); } /* * Sometimes f2fs may be better to drop out-of-place update policy. * And, users can control the policy through sysfs entries. * There are five policies with triggering conditions as follows. * F2FS_IPU_FORCE - all the time, * F2FS_IPU_SSR - if SSR mode is activated, * F2FS_IPU_UTIL - if FS utilization is over threashold, * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over * threashold, * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash * storages. IPU will be triggered only if the # of dirty * pages over min_fsync_blocks. (=default option) * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. * F2FS_IPU_NOCACHE - disable IPU bio cache. * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has * FI_OPU_WRITE flag. * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 #define DEF_MIN_HOT_BLOCKS 16 #define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ #define F2FS_IPU_DISABLE 0 /* Modification on enum should be synchronized with ipu_mode_names array */ enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, F2FS_IPU_ASYNC, F2FS_IPU_NOCACHE, F2FS_IPU_HONOR_OPU_WRITE, F2FS_IPU_MAX, }; static inline bool IS_F2FS_IPU_DISABLE(struct f2fs_sb_info *sbi) { return SM_I(sbi)->ipu_policy == F2FS_IPU_DISABLE; } #define F2FS_IPU_POLICY(name) \ static inline bool IS_##name(struct f2fs_sb_info *sbi) \ { \ return SM_I(sbi)->ipu_policy & BIT(name); \ } F2FS_IPU_POLICY(F2FS_IPU_FORCE); F2FS_IPU_POLICY(F2FS_IPU_SSR); F2FS_IPU_POLICY(F2FS_IPU_UTIL); F2FS_IPU_POLICY(F2FS_IPU_SSR_UTIL); F2FS_IPU_POLICY(F2FS_IPU_FSYNC); F2FS_IPU_POLICY(F2FS_IPU_ASYNC); F2FS_IPU_POLICY(F2FS_IPU_NOCACHE); F2FS_IPU_POLICY(F2FS_IPU_HONOR_OPU_WRITE); static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); return curseg->segno; } static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); return curseg->alloc_type; } static inline bool valid_main_segno(struct f2fs_sb_info *sbi, unsigned int segno) { return segno <= (MAIN_SEGS(sbi) - 1); } static inline void verify_fio_blkaddr(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; if (__is_valid_data_blkaddr(fio->old_blkaddr)) verify_blkaddr(sbi, fio->old_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC); verify_blkaddr(sbi, fio->new_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC_ENHANCE); } /* * Summary block is always treated as an invalid block */ static inline int check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno); /* check bitmap with valid block count */ do { if (is_valid) { next_pos = find_next_zero_bit_le(&raw_sit->valid_map, usable_blks_per_seg, cur_pos); valid_blocks += next_pos - cur_pos; } else next_pos = find_next_bit_le(&raw_sit->valid_map, usable_blks_per_seg, cur_pos); cur_pos = next_pos; is_valid = !is_valid; } while (cur_pos < usable_blks_per_seg); if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", GET_SIT_VBLOCKS(raw_sit), valid_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } if (usable_blks_per_seg < BLKS_PER_SEG(sbi)) f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map, BLKS_PER_SEG(sbi), usable_blks_per_seg) != BLKS_PER_SEG(sbi)); /* check segment usage, and check boundary of a given segment number */ if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg || !valid_main_segno(sbi, segno))) { f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } return 0; } static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); unsigned int offset = SIT_BLOCK_OFFSET(start); block_t blk_addr = sit_i->sit_base_addr + offset; f2fs_bug_on(sbi, !valid_main_segno(sbi, start)); #ifdef CONFIG_F2FS_CHECK_FS if (f2fs_test_bit(offset, sit_i->sit_bitmap) != f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) f2fs_bug_on(sbi, 1); #endif /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) blk_addr += sit_i->sit_blocks; return blk_addr; } static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi, pgoff_t block_addr) { struct sit_info *sit_i = SIT_I(sbi); block_addr -= sit_i->sit_base_addr; if (block_addr < sit_i->sit_blocks) block_addr += sit_i->sit_blocks; else block_addr -= sit_i->sit_blocks; return block_addr + sit_i->sit_base_addr; } static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) { unsigned int block_off = SIT_BLOCK_OFFSET(start); f2fs_change_bit(block_off, sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); #endif } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, bool base_time) { struct sit_info *sit_i = SIT_I(sbi); time64_t diff, now = ktime_get_boottime_seconds(); if (now >= sit_i->mounted_time) return sit_i->elapsed_time + now - sit_i->mounted_time; /* system time is set to the past */ if (!base_time) { diff = sit_i->mounted_time - now; if (sit_i->elapsed_time >= diff) return sit_i->elapsed_time - diff; return 0; } return sit_i->elapsed_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, unsigned int ofs_in_node, unsigned char version) { sum->nid = cpu_to_le32(nid); sum->ofs_in_node = cpu_to_le16(ofs_in_node); sum->version = version; } static inline block_t start_sum_block(struct f2fs_sb_info *sbi) { return __start_cp_addr(sbi) + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) { return __start_cp_addr(sbi) + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) - (base + 1) + type; } static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) return true; return false; } /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, * 512 pages (2MB) * 8 for nodes, and * 256 pages * 8 for meta are set. */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; if (type == DATA) return BLKS_PER_SEG(sbi); else if (type == NODE) return SEGS_TO_BLKS(sbi, 8); else if (type == META) return 8 * BIO_MAX_VECS; else return 0; } /* * When writing pages, it'd better align nr_to_write for segment size. */ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, struct writeback_control *wbc) { long nr_to_write, desired; if (wbc->sync_mode != WB_SYNC_NONE) return 0; nr_to_write = wbc->nr_to_write; desired = BIO_MAX_VECS; if (type == NODE) desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; } static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; bool wakeup = false; int i; if (force) goto wake_up; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (i + 1 < dcc->discard_granularity) break; if (!list_empty(&dcc->pend_list[i])) { wakeup = true; break; } } mutex_unlock(&dcc->cmd_lock); if (!wakeup || !is_idle(sbi, DISCARD_TIME)) return; wake_up: dcc->discard_wake = true; wake_up_interruptible_all(&dcc->discard_wait_queue); } static inline unsigned int first_zoned_segno(struct f2fs_sb_info *sbi) { int devi; for (devi = 0; devi < sbi->s_ndevs; devi++) if (bdev_is_zoned(FDEV(devi).bdev)) return GET_SEGNO(sbi, FDEV(devi).start_blk); return 0; } |
110 110 110 110 110 110 213 212 212 212 212 212 212 21 193 193 110 110 1 1 211 212 3654 3650 27 179 3587 252 19 5 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 | // SPDX-License-Identifier: GPL-2.0 /* * Workingset detection * * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/shmem_fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> #include "internal.h" /* * Double CLOCK lists * * Per node, two clock lists are maintained for file pages: the * inactive and the active list. Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. Pages that are accessed multiple times on the inactive list * are promoted to the active list, to protect them from reclaim, * whereas active pages are demoted to the inactive list when the * active list grows too big. * * fault ------------------------+ * | * +--------------+ | +-------------+ * reclaim <- | inactive | <-+-- demotion | active | <--+ * +--------------+ +-------------+ | * | | * +-------------- promotion ------------------+ * * * Access frequency and refault distance * * A workload is thrashing when its pages are frequently used but they * are evicted from the inactive list every time before another access * would have promoted them to the active list. * * In cases where the average access distance between thrashing pages * is bigger than the size of memory there is nothing that can be * done - the thrashing set could never fit into memory under any * circumstance. * * However, the average access distance could be bigger than the * inactive list, yet smaller than the size of memory. In this case, * the set could fit into memory if it weren't for the currently * active pages - which may be used more, hopefully less frequently: * * +-memory available to cache-+ * | | * +-inactive------+-active----+ * a b | c d e f g h i | J K L M N | * +---------------+-----------+ * * It is prohibitively expensive to accurately track access frequency * of pages. But a reasonable approximation can be made to measure * thrashing on the inactive list, after which refaulting pages can be * activated optimistically to compete with the existing active pages. * * Approximating inactive page access frequency - Observations: * * 1. When a page is accessed for the first time, it is added to the * head of the inactive list, slides every existing inactive page * towards the tail by one slot, and pushes the current tail page * out of memory. * * 2. When a page is accessed for the second time, it is promoted to * the active list, shrinking the inactive list by one slot. This * also slides all inactive pages that were faulted into the cache * more recently than the activated page towards the tail of the * inactive list. * * Thus: * * 1. The sum of evictions and activations between any two points in * time indicate the minimum number of inactive pages accessed in * between. * * 2. Moving one inactive page N page slots towards the tail of the * list requires at least N inactive page accesses. * * Combining these: * * 1. When a page is finally evicted from memory, the number of * inactive pages accessed while the page was in cache is at least * the number of page slots on the inactive list. * * 2. In addition, measuring the sum of evictions and activations (E) * at the time of a page's eviction, and comparing it to another * reading (R) at the time the page faults back into memory tells * the minimum number of accesses while the page was not cached. * This is called the refault distance. * * Because the first access of the page was the fault and the second * access the refault, we combine the in-cache distance with the * out-of-cache distance to get the complete minimum access distance * of this page: * * NR_inactive + (R - E) * * And knowing the minimum access distance of a page, we can easily * tell if the page would be able to stay in cache assuming all page * slots in the cache were available: * * NR_inactive + (R - E) <= NR_inactive + NR_active * * If we have swap we should consider about NR_inactive_anon and * NR_active_anon, so for page cache and anonymous respectively: * * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file * + NR_inactive_anon + NR_active_anon * * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon * + NR_inactive_file + NR_active_file * * Which can be further simplified to: * * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon * * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file * * Put into words, the refault distance (out-of-cache) can be seen as * a deficit in inactive list space (in-cache). If the inactive list * had (R - E) more page slots, the page would not have been evicted * in between accesses, but activated instead. And on a full system, * the only thing eating into inactive list space is active pages. * * * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given * time there is actually a good chance that pages on the active list * are no longer in active use. * * So when a refault distance of (R - E) is observed and there are at * least (R - E) pages in the userspace workingset, the refaulting page * is activated optimistically in the hope that (R - E) pages are actually * used less frequently than the refaulting page - or even not used at * all anymore. * * That means if inactive cache is refaulting with a suitable refault * distance, we assume the cache workingset is transitioning and put * pressure on the current workingset. * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. * * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * * Refaulting active pages * * If on the other hand the refaulting pages have recently been * deactivated, it means that the active list is no longer protecting * actively used cache from reclaim. The cache is NOT transitioning to * a different workingset; the existing workingset is thrashing in the * space allocated to the page cache. * * * Implementation * * For each node's LRU lists, a counter for inactive evictions and * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. */ #define WORKINGSET_SHIFT 1 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ WORKINGSET_SHIFT + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group * evictions into coarser buckets by shaving off lower timestamp bits. */ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << WORKINGSET_SHIFT) | workingset; return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp) { unsigned long entry = xa_to_value(shadow); int memcgid, nid; bool workingset; workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1); entry >>= WORKINGSET_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry; *workingsetp = workingset; } #ifdef CONFIG_LRU_GEN static void *lru_gen_eviction(struct folio *folio) { int hist; unsigned long token; unsigned long min_seq; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); int tier = lru_tier_from_refs(refs); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; min_seq = READ_ONCE(lrugen->min_seq[type]); token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; unsigned long min_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); memcg = mem_cgroup_from_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]); return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); } static void lru_gen_refault(struct folio *folio, void *shadow) { bool recent; int hist, tier, refs; bool workingset; unsigned long token; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); rcu_read_lock(); recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); if (!recent) goto unlock; lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); /* see the comment in folio_lru_refs() */ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; tier = lru_tier_from_refs(refs); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); /* * Count the following two cases as stalls: * 1. For pages accessed through page tables, hotter pages pushed out * hot pages which refaulted immediately. * 2. For pages accessed multiple times through file descriptors, * they would have been protected by sort_folio(). */ if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) { set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset)); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } unlock: rcu_read_unlock(); } #else /* !CONFIG_LRU_GEN */ static void *lru_gen_eviction(struct folio *folio) { return NULL; } static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; } static void lru_gen_refault(struct folio *folio, void *shadow) { } #endif /* CONFIG_LRU_GEN */ /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged * @nr_pages: the number of pages to count * * As in-memory pages are aged, non-resident pages need to be aged as * well, in order for the refault distances later on to be comparable * to the in-memory dimensions. This function allows reclaim and LRU * operations to drive the non-resident aging along in parallel. */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a * round-robin fashion. That means that each cgroup has an LRU * order that is composed of the LRU orders of its child * cgroups; and every page has an LRU position not just in the * cgroup that owns it, but in all of that group's ancestors. * * So when the physical inactive list of a leaf cgroup ages, * the virtual inactive lists of all its parents, including * the root cgroup's, age as well. */ do { atomic_long_add(nr_pages, &lruvec->nonresident_age); } while ((lruvec = parent_lruvec(lruvec))); } /** * workingset_eviction - note the eviction of a folio from memory * @target_memcg: the cgroup that is causing the reclaim * @folio: the folio being evicted * * Return: a shadow entry to be stored in @folio->mapping->i_pages in place * of the evicted @folio so that a later refault can be detected. */ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = folio_pgdat(folio); unsigned long eviction; struct lruvec *lruvec; int memcgid; /* Folio is fully exclusive and pins folio's memory cgroup pointer */ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) return lru_gen_eviction(folio); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); return pack_shadow(memcgid, pgdat, eviction, folio_test_workingset(folio)); } /** * workingset_test_recent - tests if the shadow entry is for a folio that was * recently evicted. Also fills in @workingset with the value unpacked from * shadow. * @shadow: the shadow entry to be tested. * @file: whether the corresponding folio is from the file lru. * @workingset: where the workingset value unpacked from shadow should * be stored. * @flush: whether to flush cgroup rstat. * * Return: true if the shadow is for a recently evicted folio; false otherwise. */ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool flush) { struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; unsigned long workingset_size; unsigned long refault; int memcgid; struct pglist_data *pgdat; unsigned long eviction; rcu_read_lock(); if (lru_gen_enabled()) { bool recent = lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset); rcu_read_unlock(); return recent; } unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; /* * Look up the memcg associated with the stored ID. It might * have been deleted since the folio's eviction. * * Note that in rare events the ID could have been recycled * for a new cgroup that refaults a shared folio. This is * impossible to tell from the available data. However, this * should be a rare and limited disturbance, and activations * are always speculative anyway. Ultimately, it's the aging * algorithm's job to shake out the minimum access frequency * for the active cache. * * XXX: On !CONFIG_MEMCG, this will always return NULL; it * would be better if the root_mem_cgroup existed in all * configurations instead. */ eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) { rcu_read_unlock(); return false; } rcu_read_unlock(); /* * Flush stats (and potentially sleep) outside the RCU read section. * * Note that workingset_test_recent() itself might be called in RCU read * section (for e.g, in cachestat) - these callers need to skip flushing * stats (via the flush argument). * * XXX: With per-memcg flushing and thresholding, is ratelimiting * still needed here? */ if (flush) mem_cgroup_flush_stats_ratelimited(eviction_memcg); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the * nonresident_age to lap a shadow entry in the field, which * can then result in a false small refault distance, leading * to a false activation should this old entry actually * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if * all the memory was available to the workingset. Whether * workingset competition needs to consider anon or not depends * on having free swap space. */ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); if (!file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); } if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON); if (file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_ANON); } } mem_cgroup_put(eviction_memcg); return refault_distance <= workingset_size; } /** * workingset_refault - Evaluate the refault of a previously evicted folio. * @folio: The freshly allocated replacement folio. * @shadow: Shadow entry of the evicted folio. * * Calculates and evaluates the refault distance of the previously * evicted folio in the context of the node and the memcg whose memory * pressure caused the eviction. */ void workingset_refault(struct folio *folio, void *shadow) { bool file = folio_is_file_lru(folio); struct pglist_data *pgdat; struct mem_cgroup *memcg; struct lruvec *lruvec; bool workingset; long nr; if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; } /* * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order * during folio reclaim is being determined. * * However, the cgroup that will own the folio is the one that * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset, true)) return; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); /* * XXX: Move to folio_add_lru() when it supports new vs * putback */ lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } } /** * workingset_activation - note a page activation * @folio: Folio that is being activated. */ void workingset_activation(struct folio *folio) { struct mem_cgroup *memcg; rcu_read_lock(); /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. * * XXX: See workingset_refault() - this should return * root_mem_cgroup even for !CONFIG_MEMCG. */ memcg = folio_memcg_rcu(folio); if (!mem_cgroup_disabled() && !memcg) goto out; workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); out: rcu_read_unlock(); } /* * Shadow entries reflect the share of the working set that does not * fit into memory, so their number depends on the access pattern of * the workload. In most cases, they will refault or get reclaimed * along with the inode, but a (malicious) workload that streams * through files with a total size several times that of available * memory, while preventing the inodes from being reclaimed, can * create excessive amounts of shadow nodes. To keep a lid on this, * track shadow nodes and reclaim them when they grow way past the * point where they would still be useful. */ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { struct address_space *mapping; struct page *page = virt_to_page(node); /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ mapping = container_of(node->array, struct address_space, i_pages); lockdep_assert_held(&mapping->i_pages.xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add_obj(&shadow_nodes, &node->private_list); __inc_node_page_state(page, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del_obj(&shadow_nodes, &node->private_list); __dec_node_page_state(page, WORKINGSET_NODES); } } } static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long max_nodes; unsigned long nodes; unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); if (!nodes) return SHRINK_EMPTY; /* * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. * * The size of the active list converges toward 100% of * overall page cache as memory grows, with only a tiny * inactive list. Assume the total cache size for that. * * Nodes might be sparsely populated, with only one shadow * entry in the extreme case. Obviously, we cannot keep one * node for every eligible shadow entry, so compromise on a * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; int i; mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else #endif pages = node_present_pages(sc->nid); max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (nodes <= max_nodes) return 0; return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) __must_hold(lru_lock) { struct xa_node *node = container_of(item, struct xa_node, private_list); struct address_space *mapping; int ret; /* * Page cache insertions and deletions synchronously maintain * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } /* For page cache we need to hold i_lock */ if (mapping->host != NULL) { if (!spin_trylock(&mapping->host->i_lock)) { xa_unlock(&mapping->i_pages); spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } } list_lru_isolate(lru, item); __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES); spin_unlock(lru_lock); /* * The nodes should only contain one or more shadow entries, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY; out: cond_resched(); spin_lock_irq(lru_lock); return ret; } static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { /* list_lru lock nests inside the IRQ-safe i_pages lock */ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, NULL); } /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. */ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { struct shrinker *workingset_shadow_shrinker; unsigned int timestamp_bits; unsigned int max_order; int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* * Calculate the eviction bucket size to cover the longest * actionable refault distance, which is currently half of * memory (totalram_pages/2). However, memory hotplug may add * some more pages at runtime, so keep working with up to * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-shadow"); if (!workingset_shadow_shrinker) goto err; ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, workingset_shadow_shrinker); if (ret) goto err_list_lru; workingset_shadow_shrinker->count_objects = count_shadow_nodes; workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; /* ->count reports only fully expendable nodes */ workingset_shadow_shrinker->seeks = 0; shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: shrinker_free(workingset_shadow_shrinker); err: return ret; } module_init(workingset_init); |
147 42 1 43 136 146 147 147 146 147 146 147 146 146 147 147 147 147 147 147 60 147 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 | // SPDX-License-Identifier: GPL-2.0-only /* * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and * Shaohua Li <shli@fb.com> */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/init.h> #include "null_blk.h" #undef pr_fmt #define pr_fmt(fmt) "null_blk: " fmt #define FREE_BATCH 16 #define TICKS_PER_SEC 50ULL #define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static DECLARE_FAULT_ATTR(null_timeout_attr); static DECLARE_FAULT_ATTR(null_requeue_attr); static DECLARE_FAULT_ATTR(null_init_hctx_attr); #endif static inline u64 mb_per_tick(int mbps) { return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); } /* * Status flags for nullb_device. * * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. * UP: Device is currently on and visible in userspace. * THROTTLED: Device is being throttled. * CACHE: Device is using a write-back cache. */ enum nullb_device_flags { NULLB_DEV_FL_CONFIGURED = 0, NULLB_DEV_FL_UP = 1, NULLB_DEV_FL_THROTTLED = 2, NULLB_DEV_FL_CACHE = 3, }; #define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) /* * nullb_page is a page in memory for nullb devices. * * @page: The page holding the data. * @bitmap: The bitmap represents which sector in the page has data. * Each bit represents one block size. For example, sector 8 * will use the 7th bit * The highest 2 bits of bitmap are for special purpose. LOCK means the cache * page is being flushing to storage. FREE means the cache page is freed and * should be skipped from flushing to storage. Please see * null_make_cache_space */ struct nullb_page { struct page *page; DECLARE_BITMAP(bitmap, MAP_SZ); }; #define NULLB_PAGE_LOCK (MAP_SZ - 1) #define NULLB_PAGE_FREE (MAP_SZ - 2) static LIST_HEAD(nullb_list); static struct mutex lock; static int null_major; static DEFINE_IDA(nullb_indexes); static struct blk_mq_tag_set tag_set; enum { NULL_IRQ_NONE = 0, NULL_IRQ_SOFTIRQ = 1, NULL_IRQ_TIMER = 2, }; static bool g_virt_boundary; module_param_named(virt_boundary, g_virt_boundary, bool, 0444); MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False"); static int g_no_sched; module_param_named(no_sched, g_no_sched, int, 0444); MODULE_PARM_DESC(no_sched, "No io scheduler"); static int g_submit_queues = 1; module_param_named(submit_queues, g_submit_queues, int, 0444); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); static int g_poll_queues = 1; module_param_named(poll_queues, g_poll_queues, int, 0444); MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues"); static int g_home_node = NUMA_NO_NODE; module_param_named(home_node, g_home_node, int, 0444); MODULE_PARM_DESC(home_node, "Home node for the device"); #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION /* * For more details about fault injection, please refer to * Documentation/fault-injection/fault-injection.rst. */ static char g_timeout_str[80]; module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>"); static char g_requeue_str[80]; module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>"); static char g_init_hctx_str[80]; module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); #endif /* * Historic queue modes. * * These days nothing but NULL_Q_MQ is actually supported, but we keep it the * enum for error reporting. */ enum { NULL_Q_BIO = 0, NULL_Q_RQ = 1, NULL_Q_MQ = 2, }; static int g_queue_mode = NULL_Q_MQ; static int null_param_store_val(const char *str, int *val, int min, int max) { int ret, new_val; ret = kstrtoint(str, 10, &new_val); if (ret) return -EINVAL; if (new_val < min || new_val > max) return -EINVAL; *val = new_val; return 0; } static int null_set_queue_mode(const char *str, const struct kernel_param *kp) { return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); } static const struct kernel_param_ops null_queue_mode_param_ops = { .set = null_set_queue_mode, .get = param_get_int, }; device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); static int g_gb = 250; module_param_named(gb, g_gb, int, 0444); MODULE_PARM_DESC(gb, "Size in GB"); static int g_bs = 512; module_param_named(bs, g_bs, int, 0444); MODULE_PARM_DESC(bs, "Block size (in bytes)"); static int g_max_sectors; module_param_named(max_sectors, g_max_sectors, int, 0444); MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); static unsigned int nr_devices = 1; module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); static bool g_blocking; module_param_named(blocking, g_blocking, bool, 0444); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); static bool g_shared_tags; module_param_named(shared_tags, g_shared_tags, bool, 0444); MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); static bool g_shared_tag_bitmap; module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); static int g_irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) { return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, NULL_IRQ_TIMER); } static const struct kernel_param_ops null_irqmode_param_ops = { .set = null_set_irqmode, .get = param_get_int, }; device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); static unsigned long g_completion_nsec = 10000; module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); static int g_hw_queue_depth = 64; module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); static bool g_use_per_node_hctx; module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); static bool g_memory_backed; module_param_named(memory_backed, g_memory_backed, bool, 0444); MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false"); static bool g_discard; module_param_named(discard, g_discard, bool, 0444); MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false"); static unsigned long g_cache_size; module_param_named(cache_size, g_cache_size, ulong, 0444); MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); static bool g_fua = true; module_param_named(fua, g_fua, bool, 0444); MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true"); static unsigned int g_mbps; module_param_named(mbps, g_mbps, uint, 0444); MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); static bool g_zoned; module_param_named(zoned, g_zoned, bool, S_IRUGO); MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); static unsigned long g_zone_size = 256; module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); static unsigned long g_zone_capacity; module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); static unsigned int g_zone_nr_conv; module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); static unsigned int g_zone_max_open; module_param_named(zone_max_open, g_zone_max_open, uint, 0444); MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); static unsigned int g_zone_max_active; module_param_named(zone_max_active, g_zone_max_active, uint, 0444); MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); static int g_zone_append_max_sectors = INT_MAX; module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444); MODULE_PARM_DESC(zone_append_max_sectors, "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation"); static bool g_zone_full; module_param_named(zone_full, g_zone_full, bool, S_IRUGO); MODULE_PARM_DESC(zone_full, "Initialize the sequential write required zones of a zoned device to be full. Default: false"); static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); static int null_add_dev(struct nullb_device *dev); static struct nullb *null_find_dev_by_name(const char *name); static void null_free_device_storage(struct nullb_device *dev, bool is_cache); static inline struct nullb_device *to_nullb_device(struct config_item *item) { return item ? container_of(to_config_group(item), struct nullb_device, group) : NULL; } static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) { return snprintf(page, PAGE_SIZE, "%u\n", val); } static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, char *page) { return snprintf(page, PAGE_SIZE, "%lu\n", val); } static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) { return snprintf(page, PAGE_SIZE, "%u\n", val); } static ssize_t nullb_device_uint_attr_store(unsigned int *val, const char *page, size_t count) { unsigned int tmp; int result; result = kstrtouint(page, 0, &tmp); if (result < 0) return result; *val = tmp; return count; } static ssize_t nullb_device_ulong_attr_store(unsigned long *val, const char *page, size_t count) { int result; unsigned long tmp; result = kstrtoul(page, 0, &tmp); if (result < 0) return result; *val = tmp; return count; } static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, size_t count) { bool tmp; int result; result = kstrtobool(page, &tmp); if (result < 0) return result; *val = tmp; return count; } /* The following macro should only be used with TYPE = {uint, ulong, bool}. */ #define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ static ssize_t \ nullb_device_##NAME##_show(struct config_item *item, char *page) \ { \ return nullb_device_##TYPE##_attr_show( \ to_nullb_device(item)->NAME, page); \ } \ static ssize_t \ nullb_device_##NAME##_store(struct config_item *item, const char *page, \ size_t count) \ { \ int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ struct nullb_device *dev = to_nullb_device(item); \ TYPE new_value = 0; \ int ret; \ \ ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ if (ret < 0) \ return ret; \ if (apply_fn) \ ret = apply_fn(dev, new_value); \ else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ ret = -EBUSY; \ if (ret < 0) \ return ret; \ dev->NAME = new_value; \ return count; \ } \ CONFIGFS_ATTR(nullb_device_, NAME); static int nullb_update_nr_hw_queues(struct nullb_device *dev, unsigned int submit_queues, unsigned int poll_queues) { struct blk_mq_tag_set *set; int ret, nr_hw_queues; if (!dev->nullb) return 0; /* * Make sure at least one submit queue exists. */ if (!submit_queues) return -EINVAL; /* * Make sure that null_init_hctx() does not access nullb->queues[] past * the end of that array. */ if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues) return -EINVAL; /* * Keep previous and new queue numbers in nullb_device for reference in * the call back function null_map_queues(). */ dev->prev_submit_queues = dev->submit_queues; dev->prev_poll_queues = dev->poll_queues; dev->submit_queues = submit_queues; dev->poll_queues = poll_queues; set = dev->nullb->tag_set; nr_hw_queues = submit_queues + poll_queues; blk_mq_update_nr_hw_queues(set, nr_hw_queues); ret = set->nr_hw_queues == nr_hw_queues ? 0 : -ENOMEM; if (ret) { /* on error, revert the queue numbers */ dev->submit_queues = dev->prev_submit_queues; dev->poll_queues = dev->prev_poll_queues; } return ret; } static int nullb_apply_submit_queues(struct nullb_device *dev, unsigned int submit_queues) { int ret; mutex_lock(&lock); ret = nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues); mutex_unlock(&lock); return ret; } static int nullb_apply_poll_queues(struct nullb_device *dev, unsigned int poll_queues) { int ret; mutex_lock(&lock); ret = nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues); mutex_unlock(&lock); return ret; } NULLB_DEVICE_ATTR(size, ulong, NULL); NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues); NULLB_DEVICE_ATTR(home_node, uint, NULL); NULLB_DEVICE_ATTR(queue_mode, uint, NULL); NULLB_DEVICE_ATTR(blocksize, uint, NULL); NULLB_DEVICE_ATTR(max_sectors, uint, NULL); NULLB_DEVICE_ATTR(irqmode, uint, NULL); NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); NULLB_DEVICE_ATTR(index, uint, NULL); NULLB_DEVICE_ATTR(blocking, bool, NULL); NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); NULLB_DEVICE_ATTR(memory_backed, bool, NULL); NULLB_DEVICE_ATTR(discard, bool, NULL); NULLB_DEVICE_ATTR(mbps, uint, NULL); NULLB_DEVICE_ATTR(cache_size, ulong, NULL); NULLB_DEVICE_ATTR(zoned, bool, NULL); NULLB_DEVICE_ATTR(zone_size, ulong, NULL); NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL); NULLB_DEVICE_ATTR(zone_full, bool, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL); NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); NULLB_DEVICE_ATTR(fua, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); } static ssize_t nullb_device_power_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *dev = to_nullb_device(item); bool newp = false; ssize_t ret; ret = nullb_device_bool_attr_store(&newp, page, count); if (ret < 0) return ret; ret = count; mutex_lock(&lock); if (!dev->power && newp) { if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) goto out; ret = null_add_dev(dev); if (ret) { clear_bit(NULLB_DEV_FL_UP, &dev->flags); goto out; } set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; ret = count; } else if (dev->power && !newp) { if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { dev->power = newp; null_del_dev(dev->nullb); } clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); } out: mutex_unlock(&lock); return ret; } CONFIGFS_ATTR(nullb_device_, power); static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) { struct nullb_device *t_dev = to_nullb_device(item); return badblocks_show(&t_dev->badblocks, page, 0); } static ssize_t nullb_device_badblocks_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *t_dev = to_nullb_device(item); char *orig, *buf, *tmp; u64 start, end; int ret; orig = kstrndup(page, count, GFP_KERNEL); if (!orig) return -ENOMEM; buf = strstrip(orig); ret = -EINVAL; if (buf[0] != '+' && buf[0] != '-') goto out; tmp = strchr(&buf[1], '-'); if (!tmp) goto out; *tmp = '\0'; ret = kstrtoull(buf + 1, 0, &start); if (ret) goto out; ret = kstrtoull(tmp + 1, 0, &end); if (ret) goto out; ret = -EINVAL; if (start > end) goto out; /* enable badblocks */ cmpxchg(&t_dev->badblocks.shift, -1, 0); if (buf[0] == '+') ret = badblocks_set(&t_dev->badblocks, start, end - start + 1, 1); else ret = badblocks_clear(&t_dev->badblocks, start, end - start + 1); if (ret == 0) ret = count; out: kfree(orig); return ret; } CONFIGFS_ATTR(nullb_device_, badblocks); static ssize_t nullb_device_zone_readonly_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *dev = to_nullb_device(item); return zone_cond_store(dev, page, count, BLK_ZONE_COND_READONLY); } CONFIGFS_ATTR_WO(nullb_device_, zone_readonly); static ssize_t nullb_device_zone_offline_store(struct config_item *item, const char *page, size_t count) { struct nullb_device *dev = to_nullb_device(item); return zone_cond_store(dev, page, count, BLK_ZONE_COND_OFFLINE); } CONFIGFS_ATTR_WO(nullb_device_, zone_offline); static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_size, &nullb_device_attr_completion_nsec, &nullb_device_attr_submit_queues, &nullb_device_attr_poll_queues, &nullb_device_attr_home_node, &nullb_device_attr_queue_mode, &nullb_device_attr_blocksize, &nullb_device_attr_max_sectors, &nullb_device_attr_irqmode, &nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, &nullb_device_attr_blocking, &nullb_device_attr_use_per_node_hctx, &nullb_device_attr_power, &nullb_device_attr_memory_backed, &nullb_device_attr_discard, &nullb_device_attr_mbps, &nullb_device_attr_cache_size, &nullb_device_attr_badblocks, &nullb_device_attr_zoned, &nullb_device_attr_zone_size, &nullb_device_attr_zone_capacity, &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_max_active, &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_readonly, &nullb_device_attr_zone_offline, &nullb_device_attr_zone_full, &nullb_device_attr_virt_boundary, &nullb_device_attr_no_sched, &nullb_device_attr_shared_tags, &nullb_device_attr_shared_tag_bitmap, &nullb_device_attr_fua, NULL, }; static void nullb_device_release(struct config_item *item) { struct nullb_device *dev = to_nullb_device(item); null_free_device_storage(dev, false); null_free_dev(dev); } static struct configfs_item_operations nullb_device_ops = { .release = nullb_device_release, }; static const struct config_item_type nullb_device_type = { .ct_item_ops = &nullb_device_ops, .ct_attrs = nullb_device_attrs, .ct_owner = THIS_MODULE, }; #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static void nullb_add_fault_config(struct nullb_device *dev) { fault_config_init(&dev->timeout_config, "timeout_inject"); fault_config_init(&dev->requeue_config, "requeue_inject"); fault_config_init(&dev->init_hctx_fault_config, "init_hctx_fault_inject"); configfs_add_default_group(&dev->timeout_config.group, &dev->group); configfs_add_default_group(&dev->requeue_config.group, &dev->group); configfs_add_default_group(&dev->init_hctx_fault_config.group, &dev->group); } #else static void nullb_add_fault_config(struct nullb_device *dev) { } #endif static struct config_group *nullb_group_make_group(struct config_group *group, const char *name) { struct nullb_device *dev; if (null_find_dev_by_name(name)) return ERR_PTR(-EEXIST); dev = null_alloc_dev(); if (!dev) return ERR_PTR(-ENOMEM); config_group_init_type_name(&dev->group, name, &nullb_device_type); nullb_add_fault_config(dev); return &dev->group; } static void nullb_group_drop_item(struct config_group *group, struct config_item *item) { struct nullb_device *dev = to_nullb_device(item); if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { mutex_lock(&lock); dev->power = false; null_del_dev(dev->nullb); mutex_unlock(&lock); } config_item_put(item); } static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, "badblocks,blocking,blocksize,cache_size,fua," "completion_nsec,discard,home_node,hw_queue_depth," "irqmode,max_sectors,mbps,memory_backed,no_sched," "poll_queues,power,queue_mode,shared_tag_bitmap," "shared_tags,size,submit_queues,use_per_node_hctx," "virt_boundary,zoned,zone_capacity,zone_max_active," "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," "zone_size,zone_append_max_sectors,zone_full\n"); } CONFIGFS_ATTR_RO(memb_group_, features); static struct configfs_attribute *nullb_group_attrs[] = { &memb_group_attr_features, NULL, }; static struct configfs_group_operations nullb_group_ops = { .make_group = nullb_group_make_group, .drop_item = nullb_group_drop_item, }; static const struct config_item_type nullb_group_type = { .ct_group_ops = &nullb_group_ops, .ct_attrs = nullb_group_attrs, .ct_owner = THIS_MODULE, }; static struct configfs_subsystem nullb_subsys = { .su_group = { .cg_item = { .ci_namebuf = "nullb", .ci_type = &nullb_group_type, }, }, }; static inline int null_cache_active(struct nullb *nullb) { return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); } static struct nullb_device *null_alloc_dev(void) { struct nullb_device *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION dev->timeout_config.attr = null_timeout_attr; dev->requeue_config.attr = null_requeue_attr; dev->init_hctx_fault_config.attr = null_init_hctx_attr; #endif INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); if (badblocks_init(&dev->badblocks, 0)) { kfree(dev); return NULL; } dev->size = g_gb * 1024; dev->completion_nsec = g_completion_nsec; dev->submit_queues = g_submit_queues; dev->prev_submit_queues = g_submit_queues; dev->poll_queues = g_poll_queues; dev->prev_poll_queues = g_poll_queues; dev->home_node = g_home_node; dev->queue_mode = g_queue_mode; dev->blocksize = g_bs; dev->max_sectors = g_max_sectors; dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; dev->memory_backed = g_memory_backed; dev->discard = g_discard; dev->cache_size = g_cache_size; dev->mbps = g_mbps; dev->use_per_node_hctx = g_use_per_node_hctx; dev->zoned = g_zoned; dev->zone_size = g_zone_size; dev->zone_capacity = g_zone_capacity; dev->zone_nr_conv = g_zone_nr_conv; dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; dev->zone_append_max_sectors = g_zone_append_max_sectors; dev->zone_full = g_zone_full; dev->virt_boundary = g_virt_boundary; dev->no_sched = g_no_sched; dev->shared_tags = g_shared_tags; dev->shared_tag_bitmap = g_shared_tag_bitmap; dev->fua = g_fua; return dev; } static void null_free_dev(struct nullb_device *dev) { if (!dev) return; null_free_zoned_dev(dev); badblocks_exit(&dev->badblocks); kfree(dev); } static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) { struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer); blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error); return HRTIMER_NORESTART; } static void null_cmd_end_timer(struct nullb_cmd *cmd) { ktime_t kt = cmd->nq->dev->completion_nsec; hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); } static void null_complete_rq(struct request *rq) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); blk_mq_end_request(rq, cmd->error); } static struct nullb_page *null_alloc_page(void) { struct nullb_page *t_page; t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO); if (!t_page) return NULL; t_page->page = alloc_pages(GFP_NOIO, 0); if (!t_page->page) { kfree(t_page); return NULL; } memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); return t_page; } static void null_free_page(struct nullb_page *t_page) { __set_bit(NULLB_PAGE_FREE, t_page->bitmap); if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) return; __free_page(t_page->page); kfree(t_page); } static bool null_page_empty(struct nullb_page *page) { int size = MAP_SZ - 2; return find_first_bit(page->bitmap, size) == size; } static void null_free_sector(struct nullb *nullb, sector_t sector, bool is_cache) { unsigned int sector_bit; u64 idx; struct nullb_page *t_page, *ret; struct radix_tree_root *root; root = is_cache ? &nullb->dev->cache : &nullb->dev->data; idx = sector >> PAGE_SECTORS_SHIFT; sector_bit = (sector & SECTOR_MASK); t_page = radix_tree_lookup(root, idx); if (t_page) { __clear_bit(sector_bit, t_page->bitmap); if (null_page_empty(t_page)) { ret = radix_tree_delete_item(root, idx, t_page); WARN_ON(ret != t_page); null_free_page(ret); if (is_cache) nullb->dev->curr_cache -= PAGE_SIZE; } } } static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, struct nullb_page *t_page, bool is_cache) { struct radix_tree_root *root; root = is_cache ? &nullb->dev->cache : &nullb->dev->data; if (radix_tree_insert(root, idx, t_page)) { null_free_page(t_page); t_page = radix_tree_lookup(root, idx); WARN_ON(!t_page || t_page->page->index != idx); } else if (is_cache) nullb->dev->curr_cache += PAGE_SIZE; return t_page; } static void null_free_device_storage(struct nullb_device *dev, bool is_cache) { unsigned long pos = 0; int nr_pages; struct nullb_page *ret, *t_pages[FREE_BATCH]; struct radix_tree_root *root; root = is_cache ? &dev->cache : &dev->data; do { int i; nr_pages = radix_tree_gang_lookup(root, (void **)t_pages, pos, FREE_BATCH); for (i = 0; i < nr_pages; i++) { pos = t_pages[i]->page->index; ret = radix_tree_delete_item(root, pos, t_pages[i]); WARN_ON(ret != t_pages[i]); null_free_page(ret); } pos++; } while (nr_pages == FREE_BATCH); if (is_cache) dev->curr_cache = 0; } static struct nullb_page *__null_lookup_page(struct nullb *nullb, sector_t sector, bool for_write, bool is_cache) { unsigned int sector_bit; u64 idx; struct nullb_page *t_page; struct radix_tree_root *root; idx = sector >> PAGE_SECTORS_SHIFT; sector_bit = (sector & SECTOR_MASK); root = is_cache ? &nullb->dev->cache : &nullb->dev->data; t_page = radix_tree_lookup(root, idx); WARN_ON(t_page && t_page->page->index != idx); if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) return t_page; return NULL; } static struct nullb_page *null_lookup_page(struct nullb *nullb, sector_t sector, bool for_write, bool ignore_cache) { struct nullb_page *page = NULL; if (!ignore_cache) page = __null_lookup_page(nullb, sector, for_write, true); if (page) return page; return __null_lookup_page(nullb, sector, for_write, false); } static struct nullb_page *null_insert_page(struct nullb *nullb, sector_t sector, bool ignore_cache) __releases(&nullb->lock) __acquires(&nullb->lock) { u64 idx; struct nullb_page *t_page; t_page = null_lookup_page(nullb, sector, true, ignore_cache); if (t_page) return t_page; spin_unlock_irq(&nullb->lock); t_page = null_alloc_page(); if (!t_page) goto out_lock; if (radix_tree_preload(GFP_NOIO)) goto out_freepage; spin_lock_irq(&nullb->lock); idx = sector >> PAGE_SECTORS_SHIFT; t_page->page->index = idx; t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); radix_tree_preload_end(); return t_page; out_freepage: null_free_page(t_page); out_lock: spin_lock_irq(&nullb->lock); return null_lookup_page(nullb, sector, true, ignore_cache); } static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) { int i; unsigned int offset; u64 idx; struct nullb_page *t_page, *ret; void *dst, *src; idx = c_page->page->index; t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { null_free_page(c_page); if (t_page && null_page_empty(t_page)) { ret = radix_tree_delete_item(&nullb->dev->data, idx, t_page); null_free_page(t_page); } return 0; } if (!t_page) return -ENOMEM; src = kmap_local_page(c_page->page); dst = kmap_local_page(t_page->page); for (i = 0; i < PAGE_SECTORS; i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { if (test_bit(i, c_page->bitmap)) { offset = (i << SECTOR_SHIFT); memcpy(dst + offset, src + offset, nullb->dev->blocksize); __set_bit(i, t_page->bitmap); } } kunmap_local(dst); kunmap_local(src); ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); null_free_page(ret); nullb->dev->curr_cache -= PAGE_SIZE; return 0; } static int null_make_cache_space(struct nullb *nullb, unsigned long n) { int i, err, nr_pages; struct nullb_page *c_pages[FREE_BATCH]; unsigned long flushed = 0, one_round; again: if ((nullb->dev->cache_size * 1024 * 1024) > nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) return 0; nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); /* * nullb_flush_cache_page could unlock before using the c_pages. To * avoid race, we don't allow page free */ for (i = 0; i < nr_pages; i++) { nullb->cache_flush_pos = c_pages[i]->page->index; /* * We found the page which is being flushed to disk by other * threads */ if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) c_pages[i] = NULL; else __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); } one_round = 0; for (i = 0; i < nr_pages; i++) { if (c_pages[i] == NULL) continue; err = null_flush_cache_page(nullb, c_pages[i]); if (err) return err; one_round++; } flushed += one_round << PAGE_SHIFT; if (n > flushed) { if (nr_pages == 0) nullb->cache_flush_pos = 0; if (one_round == 0) { /* give other threads a chance */ spin_unlock_irq(&nullb->lock); spin_lock_irq(&nullb->lock); } goto again; } return 0; } static int copy_to_nullb(struct nullb *nullb, struct page *source, unsigned int off, sector_t sector, size_t n, bool is_fua) { size_t temp, count = 0; unsigned int offset; struct nullb_page *t_page; while (count < n) { temp = min_t(size_t, nullb->dev->blocksize, n - count); if (null_cache_active(nullb) && !is_fua) null_make_cache_space(nullb, PAGE_SIZE); offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; t_page = null_insert_page(nullb, sector, !null_cache_active(nullb) || is_fua); if (!t_page) return -ENOSPC; memcpy_page(t_page->page, offset, source, off + count, temp); __set_bit(sector & SECTOR_MASK, t_page->bitmap); if (is_fua) null_free_sector(nullb, sector, true); count += temp; sector += temp >> SECTOR_SHIFT; } return 0; } static int copy_from_nullb(struct nullb *nullb, struct page *dest, unsigned int off, sector_t sector, size_t n) { size_t temp, count = 0; unsigned int offset; struct nullb_page *t_page; while (count < n) { temp = min_t(size_t, nullb->dev->blocksize, n - count); offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; t_page = null_lookup_page(nullb, sector, false, !null_cache_active(nullb)); if (t_page) memcpy_page(dest, off + count, t_page->page, offset, temp); else zero_user(dest, off + count, temp); count += temp; sector += temp >> SECTOR_SHIFT; } return 0; } static void nullb_fill_pattern(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off) { memset_page(page, off, 0xff, len); } blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, sector_t nr_sectors) { struct nullb *nullb = dev->nullb; size_t n = nr_sectors << SECTOR_SHIFT; size_t temp; spin_lock_irq(&nullb->lock); while (n > 0) { temp = min_t(size_t, n, dev->blocksize); null_free_sector(nullb, sector, false); if (null_cache_active(nullb)) null_free_sector(nullb, sector, true); sector += temp >> SECTOR_SHIFT; n -= temp; } spin_unlock_irq(&nullb->lock); return BLK_STS_OK; } static blk_status_t null_handle_flush(struct nullb *nullb) { int err; if (!null_cache_active(nullb)) return 0; spin_lock_irq(&nullb->lock); while (true) { err = null_make_cache_space(nullb, nullb->dev->cache_size * 1024 * 1024); if (err || nullb->dev->curr_cache == 0) break; } WARN_ON(!radix_tree_empty(&nullb->dev->cache)); spin_unlock_irq(&nullb->lock); return errno_to_blk_status(err); } static int null_transfer(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector, bool is_fua) { struct nullb_device *dev = nullb->dev; unsigned int valid_len = len; int err = 0; if (!is_write) { if (dev->zoned) valid_len = null_zone_valid_read_len(nullb, sector, len); if (valid_len) { err = copy_from_nullb(nullb, page, off, sector, valid_len); off += valid_len; len -= valid_len; } if (len) nullb_fill_pattern(nullb, page, len, off); flush_dcache_page(page); } else { flush_dcache_page(page); err = copy_to_nullb(nullb, page, off, sector, len, is_fua); } return err; } static blk_status_t null_handle_rq(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; int err = 0; unsigned int len; sector_t sector = blk_rq_pos(rq); struct req_iterator iter; struct bio_vec bvec; spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), sector, rq->cmd_flags & REQ_FUA); if (err) break; sector += len >> SECTOR_SHIFT; } spin_unlock_irq(&nullb->lock); return errno_to_blk_status(err); } static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; blk_status_t sts = BLK_STS_OK; struct request *rq = blk_mq_rq_from_pdu(cmd); if (!hrtimer_active(&nullb->bw_timer)) hrtimer_restart(&nullb->bw_timer); if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { blk_mq_stop_hw_queues(nullb->q); /* race with timer */ if (atomic_long_read(&nullb->cur_bytes) > 0) blk_mq_start_stopped_hw_queues(nullb->q, true); /* requeue request */ sts = BLK_STS_DEV_RESOURCE; } return sts; } static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, sector_t nr_sectors) { struct badblocks *bb = &cmd->nq->dev->badblocks; sector_t first_bad; int bad_sectors; if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) return BLK_STS_IOERR; return BLK_STS_OK; } static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, sector_t sector, sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); return null_handle_rq(cmd); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb_device *dev = cmd->nq->dev; struct bio *bio; if (!dev->memory_backed && req_op(rq) == REQ_OP_READ) { __rq_for_each_bio(bio, rq) zero_fill_bio(bio); } } static inline void nullb_complete_cmd(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); /* * Since root privileges are required to configure the null_blk * driver, it is fine that this driver does not initialize the * data buffers of read commands. Zero-initialize these buffers * anyway if KMSAN is enabled to prevent that KMSAN complains * about null_blk not initializing read data buffers. */ if (IS_ENABLED(CONFIG_KMSAN)) nullb_zero_read_cmd_buffer(cmd); /* Complete IO by inline, softirq or timer */ switch (cmd->nq->dev->irqmode) { case NULL_IRQ_SOFTIRQ: blk_mq_complete_request(rq); break; case NULL_IRQ_NONE: blk_mq_end_request(rq, cmd->error); break; case NULL_IRQ_TIMER: null_cmd_end_timer(cmd); break; } } blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors) { struct nullb_device *dev = cmd->nq->dev; blk_status_t ret; if (dev->badblocks.shift != -1) { ret = null_handle_badblocks(cmd, sector, nr_sectors); if (ret != BLK_STS_OK) return ret; } if (dev->memory_backed) return null_handle_memory_backed(cmd, op, sector, nr_sectors); return BLK_STS_OK; } static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, sector_t nr_sectors, enum req_op op) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; blk_status_t sts; if (op == REQ_OP_FLUSH) { cmd->error = null_handle_flush(nullb); goto out; } if (dev->zoned) sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors); else sts = null_process_cmd(cmd, op, sector, nr_sectors); /* Do not overwrite errors (e.g. timeout errors) */ if (cmd->error == BLK_STS_OK) cmd->error = sts; out: nullb_complete_cmd(cmd); } static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) { struct nullb *nullb = container_of(timer, struct nullb, bw_timer); ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); unsigned int mbps = nullb->dev->mbps; if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) return HRTIMER_NORESTART; atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); blk_mq_start_stopped_hw_queues(nullb->q, true); hrtimer_forward_now(&nullb->bw_timer, timer_interval); return HRTIMER_RESTART; } static void nullb_setup_bwtimer(struct nullb *nullb) { ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); nullb->bw_timer.function = nullb_bwtimer_fn; atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); } #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static bool should_timeout_request(struct request *rq) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_device *dev = cmd->nq->dev; return should_fail(&dev->timeout_config.attr, 1); } static bool should_requeue_request(struct request *rq) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_device *dev = cmd->nq->dev; return should_fail(&dev->requeue_config.attr, 1); } static bool should_init_hctx_fail(struct nullb_device *dev) { return should_fail(&dev->init_hctx_fault_config.attr, 1); } #else static bool should_timeout_request(struct request *rq) { return false; } static bool should_requeue_request(struct request *rq) { return false; } static bool should_init_hctx_fail(struct nullb_device *dev) { return false; } #endif static void null_map_queues(struct blk_mq_tag_set *set) { struct nullb *nullb = set->driver_data; int i, qoff; unsigned int submit_queues = g_submit_queues; unsigned int poll_queues = g_poll_queues; if (nullb) { struct nullb_device *dev = nullb->dev; /* * Refer nr_hw_queues of the tag set to check if the expected * number of hardware queues are prepared. If block layer failed * to prepare them, use previous numbers of submit queues and * poll queues to map queues. */ if (set->nr_hw_queues == dev->submit_queues + dev->poll_queues) { submit_queues = dev->submit_queues; poll_queues = dev->poll_queues; } else if (set->nr_hw_queues == dev->prev_submit_queues + dev->prev_poll_queues) { submit_queues = dev->prev_submit_queues; poll_queues = dev->prev_poll_queues; } else { pr_warn("tag set has unexpected nr_hw_queues: %d\n", set->nr_hw_queues); WARN_ON_ONCE(true); submit_queues = 1; poll_queues = 0; } } for (i = 0, qoff = 0; i < set->nr_maps; i++) { struct blk_mq_queue_map *map = &set->map[i]; switch (i) { case HCTX_TYPE_DEFAULT: map->nr_queues = submit_queues; break; case HCTX_TYPE_READ: map->nr_queues = 0; continue; case HCTX_TYPE_POLL: map->nr_queues = poll_queues; break; } map->queue_offset = qoff; qoff += map->nr_queues; blk_mq_map_queues(map); } } static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nullb_queue *nq = hctx->driver_data; LIST_HEAD(list); int nr = 0; struct request *rq; spin_lock(&nq->poll_lock); list_splice_init(&nq->poll_list, &list); list_for_each_entry(rq, &list, queuelist) blk_mq_set_request_complete(rq); spin_unlock(&nq->poll_lock); while (!list_empty(&list)) { struct nullb_cmd *cmd; struct request *req; req = list_first_entry(&list, struct request, queuelist); list_del_init(&req->queuelist); cmd = blk_mq_rq_to_pdu(req); cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req), blk_rq_sectors(req)); if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error, blk_mq_end_request_batch)) blk_mq_end_request(req, cmd->error); nr++; } return nr; } static enum blk_eh_timer_return null_timeout_rq(struct request *rq) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); if (hctx->type == HCTX_TYPE_POLL) { struct nullb_queue *nq = hctx->driver_data; spin_lock(&nq->poll_lock); /* The request may have completed meanwhile. */ if (blk_mq_request_completed(rq)) { spin_unlock(&nq->poll_lock); return BLK_EH_DONE; } list_del_init(&rq->queuelist); spin_unlock(&nq->poll_lock); } pr_info("rq %p timed out\n", rq); /* * If the device is marked as blocking (i.e. memory backed or zoned * device), the submission path may be blocked waiting for resources * and cause real timeouts. For these real timeouts, the submission * path will complete the request using blk_mq_complete_request(). * Only fake timeouts need to execute blk_mq_complete_request() here. */ cmd->error = BLK_STS_TIMEOUT; if (cmd->fake_timeout || hctx->type == HCTX_TYPE_POLL) blk_mq_complete_request(rq); return BLK_EH_DONE; } static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_queue *nq = hctx->driver_data; sector_t nr_sectors = blk_rq_sectors(rq); sector_t sector = blk_rq_pos(rq); const bool is_poll = hctx->type == HCTX_TYPE_POLL; might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) { hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cmd->timer.function = null_cmd_timer_expired; } cmd->error = BLK_STS_OK; cmd->nq = nq; cmd->fake_timeout = should_timeout_request(rq) || blk_should_fake_timeout(rq->q); if (should_requeue_request(rq)) { /* * Alternate between hitting the core BUSY path, and the * driver driven requeue path */ nq->requeue_selection++; if (nq->requeue_selection & 1) return BLK_STS_RESOURCE; blk_mq_requeue_request(rq, true); return BLK_STS_OK; } if (test_bit(NULLB_DEV_FL_THROTTLED, &nq->dev->flags)) { blk_status_t sts = null_handle_throttled(cmd); if (sts != BLK_STS_OK) return sts; } blk_mq_start_request(rq); if (is_poll) { spin_lock(&nq->poll_lock); list_add_tail(&rq->queuelist, &nq->poll_list); spin_unlock(&nq->poll_lock); return BLK_STS_OK; } if (cmd->fake_timeout) return BLK_STS_OK; null_handle_cmd(cmd, sector, nr_sectors, req_op(rq)); return BLK_STS_OK; } static void null_queue_rqs(struct request **rqlist) { struct request *requeue_list = NULL; struct request **requeue_lastp = &requeue_list; struct blk_mq_queue_data bd = { }; blk_status_t ret; do { struct request *rq = rq_list_pop(rqlist); bd.rq = rq; ret = null_queue_rq(rq->mq_hctx, &bd); if (ret != BLK_STS_OK) rq_list_add_tail(&requeue_lastp, rq); } while (!rq_list_empty(*rqlist)); *rqlist = requeue_list; } static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) { nq->dev = nullb->dev; INIT_LIST_HEAD(&nq->poll_list); spin_lock_init(&nq->poll_lock); } static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { struct nullb *nullb = hctx->queue->queuedata; struct nullb_queue *nq; if (should_init_hctx_fail(nullb->dev)) return -EFAULT; nq = &nullb->queues[hctx_idx]; hctx->driver_data = nq; null_init_queue(nullb, nq); return 0; } static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, .queue_rqs = null_queue_rqs, .complete = null_complete_rq, .timeout = null_timeout_rq, .poll = null_poll, .map_queues = null_map_queues, .init_hctx = null_init_hctx, }; static void null_del_dev(struct nullb *nullb) { struct nullb_device *dev; if (!nullb) return; dev = nullb->dev; ida_free(&nullb_indexes, nullb->index); list_del_init(&nullb->list); del_gendisk(nullb->disk); if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { hrtimer_cancel(&nullb->bw_timer); atomic_long_set(&nullb->cur_bytes, LONG_MAX); blk_mq_start_stopped_hw_queues(nullb->q, true); } put_disk(nullb->disk); if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); kfree(nullb->queues); if (null_cache_active(nullb)) null_free_device_storage(nullb->dev, true); kfree(nullb); dev->nullb = NULL; } static void null_config_discard(struct nullb *nullb, struct queue_limits *lim) { if (nullb->dev->discard == false) return; if (!nullb->dev->memory_backed) { nullb->dev->discard = false; pr_info("discard option is ignored without memory backing\n"); return; } if (nullb->dev->zoned) { nullb->dev->discard = false; pr_info("discard option is ignored in zoned mode\n"); return; } lim->max_hw_discard_sectors = UINT_MAX >> 9; } static const struct block_device_operations null_ops = { .owner = THIS_MODULE, .report_zones = null_report_zones, }; static int setup_queues(struct nullb *nullb) { int nqueues = nr_cpu_ids; if (g_poll_queues) nqueues += g_poll_queues; nullb->queues = kcalloc(nqueues, sizeof(struct nullb_queue), GFP_KERNEL); if (!nullb->queues) return -ENOMEM; return 0; } static int null_init_tag_set(struct blk_mq_tag_set *set, int poll_queues) { set->ops = &null_mq_ops; set->cmd_size = sizeof(struct nullb_cmd); set->timeout = 5 * HZ; set->nr_maps = 1; if (poll_queues) { set->nr_hw_queues += poll_queues; set->nr_maps += 2; } return blk_mq_alloc_tag_set(set); } static int null_init_global_tag_set(void) { int error; if (tag_set.ops) return 0; tag_set.nr_hw_queues = g_submit_queues; tag_set.queue_depth = g_hw_queue_depth; tag_set.numa_node = g_home_node; tag_set.flags = BLK_MQ_F_SHOULD_MERGE; if (g_no_sched) tag_set.flags |= BLK_MQ_F_NO_SCHED; if (g_shared_tag_bitmap) tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED; if (g_blocking) tag_set.flags |= BLK_MQ_F_BLOCKING; error = null_init_tag_set(&tag_set, g_poll_queues); if (error) tag_set.ops = NULL; return error; } static int null_setup_tagset(struct nullb *nullb) { if (nullb->dev->shared_tags) { nullb->tag_set = &tag_set; return null_init_global_tag_set(); } nullb->tag_set = &nullb->__tag_set; nullb->tag_set->driver_data = nullb; nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues; nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth; nullb->tag_set->numa_node = nullb->dev->home_node; nullb->tag_set->flags = BLK_MQ_F_SHOULD_MERGE; if (nullb->dev->no_sched) nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED; if (nullb->dev->shared_tag_bitmap) nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; if (nullb->dev->blocking) nullb->tag_set->flags |= BLK_MQ_F_BLOCKING; return null_init_tag_set(nullb->tag_set, nullb->dev->poll_queues); } static int null_validate_conf(struct nullb_device *dev) { if (dev->queue_mode == NULL_Q_RQ) { pr_err("legacy IO path is no longer available\n"); return -EINVAL; } if (dev->queue_mode == NULL_Q_BIO) { pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n"); dev->queue_mode = NULL_Q_MQ; } if (dev->use_per_node_hctx) { if (dev->submit_queues != nr_online_nodes) dev->submit_queues = nr_online_nodes; } else if (dev->submit_queues > nr_cpu_ids) dev->submit_queues = nr_cpu_ids; else if (dev->submit_queues == 0) dev->submit_queues = 1; dev->prev_submit_queues = dev->submit_queues; if (dev->poll_queues > g_poll_queues) dev->poll_queues = g_poll_queues; dev->prev_poll_queues = dev->poll_queues; dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); /* Do memory allocation, so set blocking */ if (dev->memory_backed) dev->blocking = true; else /* cache is meaningless */ dev->cache_size = 0; dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, dev->cache_size); dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); if (dev->zoned && (!dev->zone_size || !is_power_of_2(dev->zone_size))) { pr_err("zone_size must be power-of-two\n"); return -EINVAL; } return 0; } #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static bool __null_setup_fault(struct fault_attr *attr, char *str) { if (!str[0]) return true; if (!setup_fault_attr(attr, str)) return false; attr->verbose = 0; return true; } #endif static bool null_setup_fault(void) { #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) return false; if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) return false; if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) return false; #endif return true; } static int null_add_dev(struct nullb_device *dev) { struct queue_limits lim = { .logical_block_size = dev->blocksize, .physical_block_size = dev->blocksize, .max_hw_sectors = dev->max_sectors, }; struct nullb *nullb; int rv; rv = null_validate_conf(dev); if (rv) return rv; nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); if (!nullb) { rv = -ENOMEM; goto out; } nullb->dev = dev; dev->nullb = nullb; spin_lock_init(&nullb->lock); rv = setup_queues(nullb); if (rv) goto out_free_nullb; rv = null_setup_tagset(nullb); if (rv) goto out_cleanup_queues; if (dev->virt_boundary) lim.virt_boundary_mask = PAGE_SIZE - 1; null_config_discard(nullb, &lim); if (dev->zoned) { rv = null_init_zoned_dev(dev, &lim); if (rv) goto out_cleanup_tags; } if (dev->cache_size > 0) { set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); lim.features |= BLK_FEAT_WRITE_CACHE; if (dev->fua) lim.features |= BLK_FEAT_FUA; } nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb); if (IS_ERR(nullb->disk)) { rv = PTR_ERR(nullb->disk); goto out_cleanup_zone; } nullb->q = nullb->disk->queue; if (dev->mbps) { set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); nullb_setup_bwtimer(nullb); } nullb->q->queuedata = nullb; rv = ida_alloc(&nullb_indexes, GFP_KERNEL); if (rv < 0) goto out_cleanup_disk; nullb->index = rv; dev->index = rv; if (config_item_name(&dev->group.cg_item)) { /* Use configfs dir name as the device name */ snprintf(nullb->disk_name, sizeof(nullb->disk_name), "%s", config_item_name(&dev->group.cg_item)); } else { sprintf(nullb->disk_name, "nullb%d", nullb->index); } set_capacity(nullb->disk, ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT); nullb->disk->major = null_major; nullb->disk->first_minor = nullb->index; nullb->disk->minors = 1; nullb->disk->fops = &null_ops; nullb->disk->private_data = nullb; strscpy_pad(nullb->disk->disk_name, nullb->disk_name, DISK_NAME_LEN); if (nullb->dev->zoned) { rv = null_register_zoned_dev(nullb); if (rv) goto out_ida_free; } rv = add_disk(nullb->disk); if (rv) goto out_ida_free; list_add_tail(&nullb->list, &nullb_list); pr_info("disk %s created\n", nullb->disk_name); return 0; out_ida_free: ida_free(&nullb_indexes, nullb->index); out_cleanup_disk: put_disk(nullb->disk); out_cleanup_zone: null_free_zoned_dev(dev); out_cleanup_tags: if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); out_cleanup_queues: kfree(nullb->queues); out_free_nullb: kfree(nullb); dev->nullb = NULL; out: return rv; } static struct nullb *null_find_dev_by_name(const char *name) { struct nullb *nullb = NULL, *nb; mutex_lock(&lock); list_for_each_entry(nb, &nullb_list, list) { if (strcmp(nb->disk_name, name) == 0) { nullb = nb; break; } } mutex_unlock(&lock); return nullb; } static int null_create_dev(void) { struct nullb_device *dev; int ret; dev = null_alloc_dev(); if (!dev) return -ENOMEM; mutex_lock(&lock); ret = null_add_dev(dev); mutex_unlock(&lock); if (ret) { null_free_dev(dev); return ret; } return 0; } static void null_destroy_dev(struct nullb *nullb) { struct nullb_device *dev = nullb->dev; null_del_dev(nullb); null_free_device_storage(dev, false); null_free_dev(dev); } static int __init null_init(void) { int ret = 0; unsigned int i; struct nullb *nullb; if (g_bs > PAGE_SIZE) { pr_warn("invalid block size\n"); pr_warn("defaults block size to %lu\n", PAGE_SIZE); g_bs = PAGE_SIZE; } if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { pr_err("invalid home_node value\n"); g_home_node = NUMA_NO_NODE; } if (!null_setup_fault()) return -EINVAL; if (g_queue_mode == NULL_Q_RQ) { pr_err("legacy IO path is no longer available\n"); return -EINVAL; } if (g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("submit_queues param is set to %u.\n", nr_online_nodes); g_submit_queues = nr_online_nodes; } } else if (g_submit_queues > nr_cpu_ids) { g_submit_queues = nr_cpu_ids; } else if (g_submit_queues <= 0) { g_submit_queues = 1; } config_group_init(&nullb_subsys.su_group); mutex_init(&nullb_subsys.su_mutex); ret = configfs_register_subsystem(&nullb_subsys); if (ret) return ret; mutex_init(&lock); null_major = register_blkdev(0, "nullb"); if (null_major < 0) { ret = null_major; goto err_conf; } for (i = 0; i < nr_devices; i++) { ret = null_create_dev(); if (ret) goto err_dev; } pr_info("module loaded\n"); return 0; err_dev: while (!list_empty(&nullb_list)) { nullb = list_entry(nullb_list.next, struct nullb, list); null_destroy_dev(nullb); } unregister_blkdev(null_major, "nullb"); err_conf: configfs_unregister_subsystem(&nullb_subsys); return ret; } static void __exit null_exit(void) { struct nullb *nullb; configfs_unregister_subsystem(&nullb_subsys); unregister_blkdev(null_major, "nullb"); mutex_lock(&lock); while (!list_empty(&nullb_list)) { nullb = list_entry(nullb_list.next, struct nullb, list); null_destroy_dev(nullb); } mutex_unlock(&lock); if (tag_set.ops) blk_mq_free_tag_set(&tag_set); mutex_destroy(&lock); } module_init(null_init); module_exit(null_exit); MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); MODULE_DESCRIPTION("multi queue aware block test driver"); MODULE_LICENSE("GPL"); |
709 709 678 680 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _ASM_X86_INSN_H #define _ASM_X86_INSN_H /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2009 */ #include <asm/byteorder.h> /* insn_attr_t is defined in inat.h */ #include <asm/inat.h> /* __ignore_sync_check__ */ #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) struct insn_field { union { insn_value_t value; insn_byte_t bytes[4]; }; /* !0 if we've run insn_get_xxx() for this field */ unsigned char got; unsigned char nbytes; }; static inline void insn_field_set(struct insn_field *p, insn_value_t v, unsigned char n) { p->value = v; p->nbytes = n; } static inline void insn_set_byte(struct insn_field *p, unsigned char n, insn_byte_t v) { p->bytes[n] = v; } #else struct insn_field { insn_value_t value; union { insn_value_t little; insn_byte_t bytes[4]; }; /* !0 if we've run insn_get_xxx() for this field */ unsigned char got; unsigned char nbytes; }; static inline void insn_field_set(struct insn_field *p, insn_value_t v, unsigned char n) { p->value = v; p->little = __cpu_to_le32(v); p->nbytes = n; } static inline void insn_set_byte(struct insn_field *p, unsigned char n, insn_byte_t v) { p->bytes[n] = v; p->value = __le32_to_cpu(p->little); } #endif struct insn { struct insn_field prefixes; /* * Prefixes * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ struct insn_field vex_prefix; /* VEX prefix */ struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 * opcode.bytes[2]: opcode3 */ struct insn_field modrm; struct insn_field sib; struct insn_field displacement; union { struct insn_field immediate; struct insn_field moffset1; /* for 64bit MOV */ struct insn_field immediate1; /* for 64bit imm or off16/32 */ }; union { struct insn_field moffset2; /* for 64bit MOV */ struct insn_field immediate2; /* for 64bit imm or seg16 */ }; int emulate_prefix_size; insn_attr_t attr; unsigned char opnd_bytes; unsigned char addr_bytes; unsigned char length; unsigned char x86_64; const insn_byte_t *kaddr; /* kernel address of insn to analyze */ const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ const insn_byte_t *next_byte; }; #define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) #define X86_MODRM_RM(modrm) ((modrm) & 0x07) #define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6) #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) #define X86_REX2_M(rex) ((rex) & 0x80) /* REX2 M0 */ #define X86_REX2_R(rex) ((rex) & 0x40) /* REX2 R4 */ #define X86_REX2_X(rex) ((rex) & 0x20) /* REX2 X4 */ #define X86_REX2_B(rex) ((rex) & 0x10) /* REX2 B4 */ #define X86_REX_W(rex) ((rex) & 8) /* REX or REX2 W */ #define X86_REX_R(rex) ((rex) & 4) /* REX or REX2 R3 */ #define X86_REX_X(rex) ((rex) & 2) /* REX or REX2 X3 */ #define X86_REX_B(rex) ((rex) & 1) /* REX or REX2 B3 */ /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ #define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */ #define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */ #define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ #define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ /* VEX bit fields */ #define X86_EVEX_M(vex) ((vex) & 0x07) /* EVEX Byte1 */ #define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ #define X86_VEX2_M 1 /* VEX2.M always 1 */ #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); extern int insn_get_opcode(struct insn *insn); extern int insn_get_modrm(struct insn *insn); extern int insn_get_sib(struct insn *insn); extern int insn_get_displacement(struct insn *insn); extern int insn_get_immediate(struct insn *insn); extern int insn_get_length(struct insn *insn); enum insn_mode { INSN_MODE_32, INSN_MODE_64, /* Mode is determined by the current kernel build. */ INSN_MODE_KERN, INSN_NUM_MODES, }; extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m); #define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN) /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) { insn_get_modrm(insn); } /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); static inline int insn_is_rex2(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return insn->rex_prefix.nbytes == 2; } static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) { return X86_REX2_M(insn->rex_prefix.bytes[1]); } static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.value != 0); } static inline int insn_is_evex(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.nbytes == 4); } static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; } static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX2_M; else if (insn->vex_prefix.nbytes == 3) /* 3 bytes VEX */ return X86_VEX3_M(insn->vex_prefix.bytes[1]); else /* EVEX */ return X86_EVEX_M(insn->vex_prefix.bytes[1]); } static inline insn_byte_t insn_vex_p_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX_P(insn->vex_prefix.bytes[1]); else return X86_VEX_P(insn->vex_prefix.bytes[2]); } static inline insn_byte_t insn_vex_w_bit(struct insn *insn) { if (insn->vex_prefix.nbytes < 3) return 0; return X86_VEX_W(insn->vex_prefix.bytes[2]); } /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { if (insn_is_avx(insn)) return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); return 0; } /* Offset of each field from kaddr */ static inline int insn_offset_rex_prefix(struct insn *insn) { return insn->prefixes.nbytes; } static inline int insn_offset_vex_prefix(struct insn *insn) { return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; } static inline int insn_offset_opcode(struct insn *insn) { return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes; } static inline int insn_offset_modrm(struct insn *insn) { return insn_offset_opcode(insn) + insn->opcode.nbytes; } static inline int insn_offset_sib(struct insn *insn) { return insn_offset_modrm(insn) + insn->modrm.nbytes; } static inline int insn_offset_displacement(struct insn *insn) { return insn_offset_sib(insn) + insn->sib.nbytes; } static inline int insn_offset_immediate(struct insn *insn) { return insn_offset_displacement(insn) + insn->displacement.nbytes; } /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix * and the index is stored in @idx (note that this @idx is just for a cursor, * do not change it.) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. */ #define for_each_insn_prefix(insn, idx, prefix) \ for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e /* * Intel SDM Vol.3A 6.8.3 states; * "Any single-step trap that would be delivered following the MOV to SS * instruction or POP to SS instruction (because EFLAGS.TF is 1) is * suppressed." * This function returns true if @insn is MOV SS or POP SS. On these * instructions, single stepping is suppressed. */ static inline int insn_masking_exception(struct insn *insn) { return insn->opcode.bytes[0] == POP_SS_OPCODE || (insn->opcode.bytes[0] == MOV_SREG_OPCODE && X86_MODRM_REG(insn->modrm.bytes[0]) == 2); } #endif /* _ASM_X86_INSN_H */ |
233 127 117 234 232 234 234 216 217 215 126 216 215 216 9 186 217 20 20 225 74 74 74 73 74 74 73 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 | /* * Copyright (C) 2014 Red Hat * Author: Rob Clark <robdclark@gmail.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <drm/drm_atomic.h> #include <drm/drm_crtc.h> #include <drm/drm_device.h> #include <drm/drm_modeset_lock.h> #include <drm/drm_print.h> /** * DOC: kms locking * * As KMS moves toward more fine grained locking, and atomic ioctl where * userspace can indirectly control locking order, it becomes necessary * to use &ww_mutex and acquire-contexts to avoid deadlocks. But because * the locking is more distributed around the driver code, we want a bit * of extra utility/tracking out of our acquire-ctx. This is provided * by &struct drm_modeset_lock and &struct drm_modeset_acquire_ctx. * * For basic principles of &ww_mutex, see: Documentation/locking/ww-mutex-design.rst * * The basic usage pattern is to:: * * drm_modeset_acquire_init(ctx, DRM_MODESET_ACQUIRE_INTERRUPTIBLE) * retry: * foreach (lock in random_ordered_set_of_locks) { * ret = drm_modeset_lock(lock, ctx) * if (ret == -EDEADLK) { * ret = drm_modeset_backoff(ctx); * if (!ret) * goto retry; * } * if (ret) * goto out; * } * ... do stuff ... * out: * drm_modeset_drop_locks(ctx); * drm_modeset_acquire_fini(ctx); * * For convenience this control flow is implemented in * DRM_MODESET_LOCK_ALL_BEGIN() and DRM_MODESET_LOCK_ALL_END() for the case * where all modeset locks need to be taken through drm_modeset_lock_all_ctx(). * * If all that is needed is a single modeset lock, then the &struct * drm_modeset_acquire_ctx is not needed and the locking can be simplified * by passing a NULL instead of ctx in the drm_modeset_lock() call or * calling drm_modeset_lock_single_interruptible(). To unlock afterwards * call drm_modeset_unlock(). * * On top of these per-object locks using &ww_mutex there's also an overall * &drm_mode_config.mutex, for protecting everything else. Mostly this means * probe state of connectors, and preventing hotplug add/removal of connectors. * * Finally there's a bunch of dedicated locks to protect drm core internal * lists and lookup data structures. */ static DEFINE_WW_CLASS(crtc_ww_class); #if IS_ENABLED(CONFIG_DRM_DEBUG_MODESET_LOCK) static noinline depot_stack_handle_t __drm_stack_depot_save(void) { unsigned long entries[8]; unsigned int n; n = stack_trace_save(entries, ARRAY_SIZE(entries), 1); return stack_depot_save(entries, n, GFP_NOWAIT | __GFP_NOWARN); } static void __drm_stack_depot_print(depot_stack_handle_t stack_depot) { struct drm_printer p = drm_dbg_printer(NULL, DRM_UT_KMS, "drm_modeset_lock"); unsigned long *entries; unsigned int nr_entries; char *buf; buf = kmalloc(PAGE_SIZE, GFP_NOWAIT | __GFP_NOWARN); if (!buf) return; nr_entries = stack_depot_fetch(stack_depot, &entries); stack_trace_snprint(buf, PAGE_SIZE, entries, nr_entries, 2); drm_printf(&p, "attempting to lock a contended lock without backoff:\n%s", buf); kfree(buf); } static void __drm_stack_depot_init(void) { stack_depot_init(); } #else /* CONFIG_DRM_DEBUG_MODESET_LOCK */ static depot_stack_handle_t __drm_stack_depot_save(void) { return 0; } static void __drm_stack_depot_print(depot_stack_handle_t stack_depot) { } static void __drm_stack_depot_init(void) { } #endif /* CONFIG_DRM_DEBUG_MODESET_LOCK */ /** * drm_modeset_lock_all - take all modeset locks * @dev: DRM device * * This function takes all modeset locks, suitable where a more fine-grained * scheme isn't (yet) implemented. Locks must be dropped by calling the * drm_modeset_unlock_all() function. * * This function is deprecated. It allocates a lock acquisition context and * stores it in &drm_device.mode_config. This facilitate conversion of * existing code because it removes the need to manually deal with the * acquisition context, but it is also brittle because the context is global * and care must be taken not to nest calls. New code should use the * drm_modeset_lock_all_ctx() function and pass in the context explicitly. */ void drm_modeset_lock_all(struct drm_device *dev) { struct drm_mode_config *config = &dev->mode_config; struct drm_modeset_acquire_ctx *ctx; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); if (WARN_ON(!ctx)) return; mutex_lock(&config->mutex); drm_modeset_acquire_init(ctx, 0); retry: ret = drm_modeset_lock_all_ctx(dev, ctx); if (ret < 0) { if (ret == -EDEADLK) { drm_modeset_backoff(ctx); goto retry; } drm_modeset_acquire_fini(ctx); kfree(ctx); return; } ww_acquire_done(&ctx->ww_ctx); WARN_ON(config->acquire_ctx); /* * We hold the locks now, so it is safe to stash the acquisition * context for drm_modeset_unlock_all(). */ config->acquire_ctx = ctx; drm_warn_on_modeset_not_all_locked(dev); } EXPORT_SYMBOL(drm_modeset_lock_all); /** * drm_modeset_unlock_all - drop all modeset locks * @dev: DRM device * * This function drops all modeset locks taken by a previous call to the * drm_modeset_lock_all() function. * * This function is deprecated. It uses the lock acquisition context stored * in &drm_device.mode_config. This facilitates conversion of existing * code because it removes the need to manually deal with the acquisition * context, but it is also brittle because the context is global and care must * be taken not to nest calls. New code should pass the acquisition context * directly to the drm_modeset_drop_locks() function. */ void drm_modeset_unlock_all(struct drm_device *dev) { struct drm_mode_config *config = &dev->mode_config; struct drm_modeset_acquire_ctx *ctx = config->acquire_ctx; if (WARN_ON(!ctx)) return; config->acquire_ctx = NULL; drm_modeset_drop_locks(ctx); drm_modeset_acquire_fini(ctx); kfree(ctx); mutex_unlock(&dev->mode_config.mutex); } EXPORT_SYMBOL(drm_modeset_unlock_all); /** * drm_warn_on_modeset_not_all_locked - check that all modeset locks are locked * @dev: device * * Useful as a debug assert. */ void drm_warn_on_modeset_not_all_locked(struct drm_device *dev) { struct drm_crtc *crtc; /* Locking is currently fubar in the panic handler. */ if (oops_in_progress) return; drm_for_each_crtc(crtc, dev) WARN_ON(!drm_modeset_is_locked(&crtc->mutex)); WARN_ON(!drm_modeset_is_locked(&dev->mode_config.connection_mutex)); WARN_ON(!mutex_is_locked(&dev->mode_config.mutex)); } EXPORT_SYMBOL(drm_warn_on_modeset_not_all_locked); /** * drm_modeset_acquire_init - initialize acquire context * @ctx: the acquire context * @flags: 0 or %DRM_MODESET_ACQUIRE_INTERRUPTIBLE * * When passing %DRM_MODESET_ACQUIRE_INTERRUPTIBLE to @flags, * all calls to drm_modeset_lock() will perform an interruptible * wait. */ void drm_modeset_acquire_init(struct drm_modeset_acquire_ctx *ctx, uint32_t flags) { memset(ctx, 0, sizeof(*ctx)); ww_acquire_init(&ctx->ww_ctx, &crtc_ww_class); INIT_LIST_HEAD(&ctx->locked); if (flags & DRM_MODESET_ACQUIRE_INTERRUPTIBLE) ctx->interruptible = true; } EXPORT_SYMBOL(drm_modeset_acquire_init); /** * drm_modeset_acquire_fini - cleanup acquire context * @ctx: the acquire context */ void drm_modeset_acquire_fini(struct drm_modeset_acquire_ctx *ctx) { ww_acquire_fini(&ctx->ww_ctx); } EXPORT_SYMBOL(drm_modeset_acquire_fini); /** * drm_modeset_drop_locks - drop all locks * @ctx: the acquire context * * Drop all locks currently held against this acquire context. */ void drm_modeset_drop_locks(struct drm_modeset_acquire_ctx *ctx) { if (WARN_ON(ctx->contended)) __drm_stack_depot_print(ctx->stack_depot); while (!list_empty(&ctx->locked)) { struct drm_modeset_lock *lock; lock = list_first_entry(&ctx->locked, struct drm_modeset_lock, head); drm_modeset_unlock(lock); } } EXPORT_SYMBOL(drm_modeset_drop_locks); static inline int modeset_lock(struct drm_modeset_lock *lock, struct drm_modeset_acquire_ctx *ctx, bool interruptible, bool slow) { int ret; if (WARN_ON(ctx->contended)) __drm_stack_depot_print(ctx->stack_depot); if (ctx->trylock_only) { lockdep_assert_held(&ctx->ww_ctx); if (!ww_mutex_trylock(&lock->mutex, NULL)) return -EBUSY; else return 0; } else if (interruptible && slow) { ret = ww_mutex_lock_slow_interruptible(&lock->mutex, &ctx->ww_ctx); } else if (interruptible) { ret = ww_mutex_lock_interruptible(&lock->mutex, &ctx->ww_ctx); } else if (slow) { ww_mutex_lock_slow(&lock->mutex, &ctx->ww_ctx); ret = 0; } else { ret = ww_mutex_lock(&lock->mutex, &ctx->ww_ctx); } if (!ret) { WARN_ON(!list_empty(&lock->head)); list_add(&lock->head, &ctx->locked); } else if (ret == -EALREADY) { /* we already hold the lock.. this is fine. For atomic * we will need to be able to drm_modeset_lock() things * without having to keep track of what is already locked * or not. */ ret = 0; } else if (ret == -EDEADLK) { ctx->contended = lock; ctx->stack_depot = __drm_stack_depot_save(); } return ret; } /** * drm_modeset_backoff - deadlock avoidance backoff * @ctx: the acquire context * * If deadlock is detected (ie. drm_modeset_lock() returns -EDEADLK), * you must call this function to drop all currently held locks and * block until the contended lock becomes available. * * This function returns 0 on success, or -ERESTARTSYS if this context * is initialized with %DRM_MODESET_ACQUIRE_INTERRUPTIBLE and the * wait has been interrupted. */ int drm_modeset_backoff(struct drm_modeset_acquire_ctx *ctx) { struct drm_modeset_lock *contended = ctx->contended; ctx->contended = NULL; ctx->stack_depot = 0; if (WARN_ON(!contended)) return 0; drm_modeset_drop_locks(ctx); return modeset_lock(contended, ctx, ctx->interruptible, true); } EXPORT_SYMBOL(drm_modeset_backoff); /** * drm_modeset_lock_init - initialize lock * @lock: lock to init */ void drm_modeset_lock_init(struct drm_modeset_lock *lock) { ww_mutex_init(&lock->mutex, &crtc_ww_class); INIT_LIST_HEAD(&lock->head); __drm_stack_depot_init(); } EXPORT_SYMBOL(drm_modeset_lock_init); /** * drm_modeset_lock - take modeset lock * @lock: lock to take * @ctx: acquire ctx * * If @ctx is not NULL, then its ww acquire context is used and the * lock will be tracked by the context and can be released by calling * drm_modeset_drop_locks(). If -EDEADLK is returned, this means a * deadlock scenario has been detected and it is an error to attempt * to take any more locks without first calling drm_modeset_backoff(). * * If the @ctx is not NULL and initialized with * %DRM_MODESET_ACQUIRE_INTERRUPTIBLE, this function will fail with * -ERESTARTSYS when interrupted. * * If @ctx is NULL then the function call behaves like a normal, * uninterruptible non-nesting mutex_lock() call. */ int drm_modeset_lock(struct drm_modeset_lock *lock, struct drm_modeset_acquire_ctx *ctx) { if (ctx) return modeset_lock(lock, ctx, ctx->interruptible, false); ww_mutex_lock(&lock->mutex, NULL); return 0; } EXPORT_SYMBOL(drm_modeset_lock); /** * drm_modeset_lock_single_interruptible - take a single modeset lock * @lock: lock to take * * This function behaves as drm_modeset_lock() with a NULL context, * but performs interruptible waits. * * This function returns 0 on success, or -ERESTARTSYS when interrupted. */ int drm_modeset_lock_single_interruptible(struct drm_modeset_lock *lock) { return ww_mutex_lock_interruptible(&lock->mutex, NULL); } EXPORT_SYMBOL(drm_modeset_lock_single_interruptible); /** * drm_modeset_unlock - drop modeset lock * @lock: lock to release */ void drm_modeset_unlock(struct drm_modeset_lock *lock) { list_del_init(&lock->head); ww_mutex_unlock(&lock->mutex); } EXPORT_SYMBOL(drm_modeset_unlock); /** * drm_modeset_lock_all_ctx - take all modeset locks * @dev: DRM device * @ctx: lock acquisition context * * This function takes all modeset locks, suitable where a more fine-grained * scheme isn't (yet) implemented. * * Unlike drm_modeset_lock_all(), it doesn't take the &drm_mode_config.mutex * since that lock isn't required for modeset state changes. Callers which * need to grab that lock too need to do so outside of the acquire context * @ctx. * * Locks acquired with this function should be released by calling the * drm_modeset_drop_locks() function on @ctx. * * See also: DRM_MODESET_LOCK_ALL_BEGIN() and DRM_MODESET_LOCK_ALL_END() * * Returns: 0 on success or a negative error-code on failure. */ int drm_modeset_lock_all_ctx(struct drm_device *dev, struct drm_modeset_acquire_ctx *ctx) { struct drm_private_obj *privobj; struct drm_crtc *crtc; struct drm_plane *plane; int ret; ret = drm_modeset_lock(&dev->mode_config.connection_mutex, ctx); if (ret) return ret; drm_for_each_crtc(crtc, dev) { ret = drm_modeset_lock(&crtc->mutex, ctx); if (ret) return ret; } drm_for_each_plane(plane, dev) { ret = drm_modeset_lock(&plane->mutex, ctx); if (ret) return ret; } drm_for_each_privobj(privobj, dev) { ret = drm_modeset_lock(&privobj->lock, ctx); if (ret) return ret; } return 0; } EXPORT_SYMBOL(drm_modeset_lock_all_ctx); |
62 8 46 2 9 8 34 34 34 31 2 1 1 49 25 60 60 60 47 4 1 3 4 21 1 1 2 18 1 1 1 17 17 17 25 44 6 39 18 11 36 12 18 12 24 24 18 18 18 18 18 18 17 17 39 39 36 36 36 36 36 5 5 2 11 2 1 1 7 35 35 35 35 25 28 34 138 98 37 43 44 1 1 1 4 4 1 4 7 25 5 16 6 20 25 23 2 25 17 24 2 15 1 13 13 1 44 38 7 41 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Userspace interface * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netpoll.h> #include <linux/ethtool.h> #include <linux/if_arp.h> #include <linux/module.h> #include <linux/init.h> #include <linux/rtnetlink.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <net/dsa.h> #include <net/sock.h> #include <linux/if_vlan.h> #include <net/switchdev.h> #include <net/net_namespace.h> #include "br_private.h" /* * Determine initial path cost based on speed. * using recommendations from 802.1d standard * * Since driver might sleep need to not be holding any locks. */ static int port_cost(struct net_device *dev) { struct ethtool_link_ksettings ecmd; if (!__ethtool_get_link_ksettings(dev, &ecmd)) { switch (ecmd.base.speed) { case SPEED_10000: return 2; case SPEED_5000: return 3; case SPEED_2500: return 4; case SPEED_1000: return 5; case SPEED_100: return 19; case SPEED_10: return 100; case SPEED_UNKNOWN: return 100; default: if (ecmd.base.speed > SPEED_10000) return 1; } } /* Old silly heuristics based on name */ if (!strncmp(dev->name, "lec", 3)) return 7; if (!strncmp(dev->name, "plip", 4)) return 2500; return 100; /* assume old 10Mbps */ } /* Check for port carrier transitions. */ void br_port_carrier_check(struct net_bridge_port *p, bool *notified) { struct net_device *dev = p->dev; struct net_bridge *br = p->br; if (!(p->flags & BR_ADMIN_COST) && netif_running(dev) && netif_oper_up(dev)) p->path_cost = port_cost(dev); *notified = false; if (!netif_running(br->dev)) return; spin_lock_bh(&br->lock); if (netif_running(dev) && netif_oper_up(dev)) { if (p->state == BR_STATE_DISABLED) { br_stp_enable_port(p); *notified = true; } } else { if (p->state != BR_STATE_DISABLED) { br_stp_disable_port(p); *notified = true; } } spin_unlock_bh(&br->lock); } static void br_port_set_promisc(struct net_bridge_port *p) { int err = 0; if (br_promisc_port(p)) return; err = dev_set_promiscuity(p->dev, 1); if (err) return; br_fdb_unsync_static(p->br, p); p->flags |= BR_PROMISC; } static void br_port_clear_promisc(struct net_bridge_port *p) { int err; /* Check if the port is already non-promisc or if it doesn't * support UNICAST filtering. Without unicast filtering support * we'll end up re-enabling promisc mode anyway, so just check for * it here. */ if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT)) return; /* Since we'll be clearing the promisc mode, program the port * first so that we don't have interruption in traffic. */ err = br_fdb_sync_static(p->br, p); if (err) return; dev_set_promiscuity(p->dev, -1); p->flags &= ~BR_PROMISC; } /* When a port is added or removed or when certain port flags * change, this function is called to automatically manage * promiscuity setting of all the bridge ports. We are always called * under RTNL so can skip using rcu primitives. */ void br_manage_promisc(struct net_bridge *br) { struct net_bridge_port *p; bool set_all = false; /* If vlan filtering is disabled or bridge interface is placed * into promiscuous mode, place all ports in promiscuous mode. */ if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br->dev)) set_all = true; list_for_each_entry(p, &br->port_list, list) { if (set_all) { br_port_set_promisc(p); } else { /* If the number of auto-ports is <= 1, then all other * ports will have their output configuration * statically specified through fdbs. Since ingress * on the auto-port becomes forwarding/egress to other * ports and egress configuration is statically known, * we can say that ingress configuration of the * auto-port is also statically known. * This lets us disable promiscuous mode and write * this config to hw. */ if ((p->dev->priv_flags & IFF_UNICAST_FLT) && (br->auto_cnt == 0 || (br->auto_cnt == 1 && br_auto_port(p)))) br_port_clear_promisc(p); else br_port_set_promisc(p); } } } int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev) { struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port); struct net_bridge_port *backup_p = NULL; ASSERT_RTNL(); if (backup_dev) { if (!netif_is_bridge_port(backup_dev)) return -ENOENT; backup_p = br_port_get_rtnl(backup_dev); if (backup_p->br != p->br) return -EINVAL; } if (p == backup_p) return -EINVAL; if (old_backup == backup_p) return 0; /* if the backup link is already set, clear it */ if (old_backup) old_backup->backup_redirected_cnt--; if (backup_p) backup_p->backup_redirected_cnt++; rcu_assign_pointer(p->backup_port, backup_p); return 0; } static void nbp_backup_clear(struct net_bridge_port *p) { nbp_backup_change(p, NULL); if (p->backup_redirected_cnt) { struct net_bridge_port *cur_p; list_for_each_entry(cur_p, &p->br->port_list, list) { struct net_bridge_port *backup_p; backup_p = rtnl_dereference(cur_p->backup_port); if (backup_p == p) nbp_backup_change(cur_p, NULL); } } WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt); } static void nbp_update_port_count(struct net_bridge *br) { struct net_bridge_port *p; u32 cnt = 0; list_for_each_entry(p, &br->port_list, list) { if (br_auto_port(p)) cnt++; } if (br->auto_cnt != cnt) { br->auto_cnt = cnt; br_manage_promisc(br); } } static void nbp_delete_promisc(struct net_bridge_port *p) { /* If port is currently promiscuous, unset promiscuity. * Otherwise, it is a static port so remove all addresses * from it. */ dev_set_allmulti(p->dev, -1); if (br_promisc_port(p)) dev_set_promiscuity(p->dev, -1); else br_fdb_unsync_static(p->br, p); } static void release_nbp(struct kobject *kobj) { struct net_bridge_port *p = container_of(kobj, struct net_bridge_port, kobj); kfree(p); } static void brport_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { struct net_bridge_port *p = kobj_to_brport(kobj); net_ns_get_ownership(dev_net(p->dev), uid, gid); } static const struct kobj_type brport_ktype = { #ifdef CONFIG_SYSFS .sysfs_ops = &brport_sysfs_ops, #endif .release = release_nbp, .get_ownership = brport_get_ownership, }; static void destroy_nbp(struct net_bridge_port *p) { struct net_device *dev = p->dev; p->br = NULL; p->dev = NULL; netdev_put(dev, &p->dev_tracker); kobject_put(&p->kobj); } static void destroy_nbp_rcu(struct rcu_head *head) { struct net_bridge_port *p = container_of(head, struct net_bridge_port, rcu); destroy_nbp(p); } static unsigned get_max_headroom(struct net_bridge *br) { unsigned max_headroom = 0; struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { unsigned dev_headroom = netdev_get_fwd_headroom(p->dev); if (dev_headroom > max_headroom) max_headroom = dev_headroom; } return max_headroom; } static void update_headroom(struct net_bridge *br, int new_hr) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) netdev_set_rx_headroom(p->dev, new_hr); br->dev->needed_headroom = new_hr; } /* Delete port(interface) from bridge is done in two steps. * via RCU. First step, marks device as down. That deletes * all the timers and stops new packets from flowing through. * * Final cleanup doesn't occur until after all CPU's finished * processing packets. * * Protected from multiple admin operations by RTNL mutex */ static void del_nbp(struct net_bridge_port *p) { struct net_bridge *br = p->br; struct net_device *dev = p->dev; sysfs_remove_link(br->ifobj, p->dev->name); nbp_delete_promisc(p); spin_lock_bh(&br->lock); br_stp_disable_port(p); spin_unlock_bh(&br->lock); br_mrp_port_del(br, p); br_cfm_port_del(br, p); br_ifinfo_notify(RTM_DELLINK, NULL, p); list_del_rcu(&p->list); if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom) update_headroom(br, get_max_headroom(br)); netdev_reset_rx_headroom(dev); nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); switchdev_deferred_process(); nbp_backup_clear(p); nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); br_multicast_del_port(p); kobject_uevent(&p->kobj, KOBJ_REMOVE); kobject_del(&p->kobj); br_netpoll_disable(p); call_rcu(&p->rcu, destroy_nbp_rcu); } /* Delete bridge device */ void br_dev_delete(struct net_device *dev, struct list_head *head) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p, *n; list_for_each_entry_safe(p, n, &br->port_list, list) { del_nbp(p); } br_recalculate_neigh_suppress_enabled(br); br_fdb_delete_by_port(br, NULL, 0, 1); cancel_delayed_work_sync(&br->gc_work); br_sysfs_delbr(br->dev); unregister_netdevice_queue(br->dev, head); } /* find an available port number */ static int find_portno(struct net_bridge *br) { int index; struct net_bridge_port *p; unsigned long *inuse; inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); if (!inuse) return -ENOMEM; __set_bit(0, inuse); /* zero is reserved */ list_for_each_entry(p, &br->port_list, list) __set_bit(p->port_no, inuse); index = find_first_zero_bit(inuse, BR_MAX_PORTS); bitmap_free(inuse); return (index >= BR_MAX_PORTS) ? -EXFULL : index; } /* called with RTNL but without bridge lock */ static struct net_bridge_port *new_nbp(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int index, err; index = find_portno(br); if (index < 0) return ERR_PTR(index); p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return ERR_PTR(-ENOMEM); p->br = br; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); err = br_multicast_add_port(p); if (err) { netdev_put(dev, &p->dev_tracker); kfree(p); p = ERR_PTR(err); } return p; } int br_add_bridge(struct net *net, const char *name) { struct net_device *dev; int res; dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN, br_dev_setup); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &br_link_ops; res = register_netdevice(dev); if (res) free_netdev(dev); return res; } int br_del_bridge(struct net *net, const char *name) { struct net_device *dev; int ret = 0; dev = __dev_get_by_name(net, name); if (dev == NULL) ret = -ENXIO; /* Could not find device */ else if (!netif_is_bridge_master(dev)) { /* Attempt to delete non bridge device! */ ret = -EPERM; } else if (dev->flags & IFF_UP) { /* Not shutdown yet. */ ret = -EBUSY; } else br_dev_delete(dev, NULL); return ret; } /* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */ static int br_mtu_min(const struct net_bridge *br) { const struct net_bridge_port *p; int ret_mtu = 0; list_for_each_entry(p, &br->port_list, list) if (!ret_mtu || ret_mtu > p->dev->mtu) ret_mtu = p->dev->mtu; return ret_mtu ? ret_mtu : ETH_DATA_LEN; } void br_mtu_auto_adjust(struct net_bridge *br) { ASSERT_RTNL(); /* if the bridge MTU was manually configured don't mess with it */ if (br_opt_get(br, BROPT_MTU_SET_BY_USER)) return; /* change to the minimum MTU and clear the flag which was set by * the bridge ndo_change_mtu callback */ dev_set_mtu(br->dev, br_mtu_min(br)); br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false); } static void br_set_gso_limits(struct net_bridge *br) { unsigned int tso_max_size = TSO_MAX_SIZE; const struct net_bridge_port *p; u16 tso_max_segs = TSO_MAX_SEGS; list_for_each_entry(p, &br->port_list, list) { tso_max_size = min(tso_max_size, p->dev->tso_max_size); tso_max_segs = min(tso_max_segs, p->dev->tso_max_segs); } netif_set_tso_max_size(br->dev, tso_max_size); netif_set_tso_max_segs(br->dev, tso_max_segs); } /* * Recomputes features using slave's features */ netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features) { struct net_bridge_port *p; netdev_features_t mask; if (list_empty(&br->port_list)) return features; mask = features; features &= ~NETIF_F_ONE_FOR_ALL; list_for_each_entry(p, &br->port_list, list) { features = netdev_increment_features(features, p->dev->features, mask); } features = netdev_add_tso_features(features, mask); return features; } /* called with RTNL */ int br_add_if(struct net_bridge *br, struct net_device *dev, struct netlink_ext_ack *extack) { struct net_bridge_port *p; int err = 0; unsigned br_hr, dev_hr; bool changed_addr, fdb_synced = false; /* Don't allow bridging non-ethernet like devices. */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) return -EINVAL; /* No bridging of bridges */ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { NL_SET_ERR_MSG(extack, "Can not enslave a bridge to a bridge"); return -ELOOP; } /* Device has master upper dev */ if (netdev_master_upper_dev_get(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. wireless) */ if (dev->priv_flags & IFF_DONT_BRIDGE) { NL_SET_ERR_MSG(extack, "Device does not allow enslaving to a bridge"); return -EOPNOTSUPP; } p = new_nbp(br, dev); if (IS_ERR(p)) return PTR_ERR(p); call_netdevice_notifiers(NETDEV_JOIN, dev); err = dev_set_allmulti(dev, 1); if (err) { br_multicast_del_port(p); netdev_put(dev, &p->dev_tracker); kfree(p); /* kobject not yet init'd, manually free */ goto err1; } err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj), SYSFS_BRIDGE_PORT_ATTR); if (err) goto err2; err = br_sysfs_addif(p); if (err) goto err2; err = br_netpoll_enable(p); if (err) goto err3; err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p); if (err) goto err4; dev->priv_flags |= IFF_BRIDGE_PORT; err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack); if (err) goto err5; dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); nbp_update_port_count(br); if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) { /* When updating the port count we also update all ports' * promiscuous mode. * A port leaving promiscuous mode normally gets the bridge's * fdb synced to the unicast filter (if supported), however, * `br_port_clear_promisc` does not distinguish between * non-promiscuous ports and *new* ports, so we need to * sync explicitly here. */ fdb_synced = br_fdb_sync_static(br, p) == 0; if (!fdb_synced) netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n"); } netdev_update_features(br->dev); br_hr = br->dev->needed_headroom; dev_hr = netdev_get_fwd_headroom(dev); if (br_hr < dev_hr) update_headroom(br, dev_hr); else netdev_set_rx_headroom(dev, br_hr); if (br_fdb_add_local(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); if (br->dev->addr_assign_type != NET_ADDR_SET) { /* Ask for permission to use this MAC address now, even if we * don't end up choosing it below. */ err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack); if (err) goto err6; } err = nbp_vlan_init(p, extack); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); goto err6; } spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); if (netif_running(dev) && netif_oper_up(dev) && (br->dev->flags & IFF_UP)) br_stp_enable_port(p); spin_unlock_bh(&br->lock); br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); br_mtu_auto_adjust(br); br_set_gso_limits(br); kobject_uevent(&p->kobj, KOBJ_ADD); return 0; err6: if (fdb_synced) br_fdb_unsync_static(br, p); list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); err4: br_netpoll_disable(p); err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: br_multicast_del_port(p); netdev_put(dev, &p->dev_tracker); kobject_put(&p->kobj); dev_set_allmulti(dev, -1); err1: return err; } /* called with RTNL */ int br_del_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; bool changed_addr; p = br_port_get_rtnl(dev); if (!p || p->br != br) return -EINVAL; /* Since more than one interface can be attached to a bridge, * there still maybe an alternate path for netconsole to use; * therefore there is no reason for a NETDEV_RELEASE event. */ del_nbp(p); br_mtu_auto_adjust(br); br_set_gso_limits(br); spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); netdev_update_features(br->dev); return 0; } void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) { struct net_bridge *br = p->br; if (mask & BR_AUTO_MASK) nbp_update_port_count(br); if (mask & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS)) br_recalculate_neigh_suppress_enabled(br); } bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag) { struct net_bridge_port *p; p = br_port_get_rtnl_rcu(dev); if (!p) return false; return p->flags & flag; } EXPORT_SYMBOL_GPL(br_port_flag_is_set); |
5 3 2 4 2 1 1 3 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <linux/fsnotify.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "sync.h" struct io_sync { struct file *file; loff_t len; loff_t off; int flags; int mode; }; int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->len); sync->flags = READ_ONCE(sqe->sync_range_flags); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); int ret; /* sync_file_range always requires a blocking context */ WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_set_res(req, ret, 0); return IOU_OK; } int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; sync->flags = READ_ONCE(sqe->fsync_flags); if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) return -EINVAL; sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->len); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_fsync(struct io_kiocb *req, unsigned int issue_flags) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); loff_t end = sync->off + sync->len; int ret; /* fsync always requires a blocking context */ WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); io_req_set_res(req, ret, 0); return IOU_OK; } int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->addr); sync->mode = READ_ONCE(sqe->len); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); int ret; /* fallocate always requiring blocking context */ WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); io_req_set_res(req, ret, 0); return IOU_OK; } |
2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 | // SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/mmzone.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/swap.h> #include <linux/vmstat.h> #include <linux/atomic.h> #include <linux/vmalloc.h> #ifdef CONFIG_CMA #include <linux/cma.h> #endif #include <linux/zswap.h> #include <asm/page.h> #include "internal.h" void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) { } static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; unsigned long committed; long cached; long available; unsigned long pages[NR_LRU_LISTS]; unsigned long sreclaimable, sunreclaim; int lru; si_meminfo(&i); si_swapinfo(&i); committed = vm_memory_committed(); cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; if (cached < 0) cached = 0; for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); show_val_kb(m, "MemAvailable: ", available); show_val_kb(m, "Buffers: ", i.bufferram); show_val_kb(m, "Cached: ", cached); show_val_kb(m, "SwapCached: ", total_swapcache_pages()); show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); #ifdef CONFIG_HIGHMEM show_val_kb(m, "HighTotal: ", i.totalhigh); show_val_kb(m, "HighFree: ", i.freehigh); show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh); show_val_kb(m, "LowFree: ", i.freeram - i.freehigh); #endif #ifndef CONFIG_MMU show_val_kb(m, "MmapCopy: ", (unsigned long)atomic_long_read(&mmap_pages_allocated)); #endif show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); #ifdef CONFIG_ZSWAP show_val_kb(m, "Zswap: ", zswap_total_pages()); seq_printf(m, "Zswapped: %8lu kB\n", (unsigned long)atomic_read(&zswap_stored_pages) << (PAGE_SHIFT - 10)); #endif show_val_kb(m, "Dirty: ", global_node_page_state(NR_FILE_DIRTY)); show_val_kb(m, "Writeback: ", global_node_page_state(NR_WRITEBACK)); show_val_kb(m, "AnonPages: ", global_node_page_state(NR_ANON_MAPPED)); show_val_kb(m, "Mapped: ", global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); show_val_kb(m, "KReclaimable: ", sreclaimable + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); show_val_kb(m, "SReclaimable: ", sreclaimable); show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_node_page_state(NR_KERNEL_STACK_KB)); #ifdef CONFIG_SHADOW_CALL_STACK seq_printf(m, "ShadowCallStack:%8lu kB\n", global_node_page_state(NR_KERNEL_SCS_KB)); #endif show_val_kb(m, "PageTables: ", global_node_page_state(NR_PAGETABLE)); show_val_kb(m, "SecPageTables: ", global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); memtest_report_meminfo(m); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)); #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE show_val_kb(m, "AnonHugePages: ", global_node_page_state(NR_ANON_THPS)); show_val_kb(m, "ShmemHugePages: ", global_node_page_state(NR_SHMEM_THPS)); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED)); show_val_kb(m, "FileHugePages: ", global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", global_node_page_state(NR_FILE_PMDMAPPED)); #endif #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); show_val_kb(m, "CmaFree: ", global_zone_page_state(NR_FREE_CMA_PAGES)); #endif #ifdef CONFIG_UNACCEPTED_MEMORY show_val_kb(m, "Unaccepted: ", global_zone_page_state(NR_UNACCEPTED)); #endif hugetlb_report_meminfo(m); arch_report_meminfo(m); return 0; } static int __init proc_meminfo_init(void) { struct proc_dir_entry *pde; pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show); pde_make_permanent(pde); return 0; } fs_initcall(proc_meminfo_init); |
104 104 99 111 9 104 104 111 9 104 108 114 8 108 107 1 1 107 55 113 8 107 105 2 2 2 2 111 24 89 89 89 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 1996-2000 Russell King. * * Scan ADFS partitions on hard disk drives. Unfortunately, there * isn't a standard for partitioning drives on Acorn machines, so * every single manufacturer of SCSI and IDE cards created their own * method. */ #include <linux/buffer_head.h> #include <linux/adfs_fs.h> #include "check.h" /* * Partition types. (Oh for reusability) */ #define PARTITION_RISCIX_MFM 1 #define PARTITION_RISCIX_SCSI 2 #define PARTITION_LINUX 9 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) static struct adfs_discrecord * adfs_partition(struct parsed_partitions *state, char *name, char *data, unsigned long first_sector, int slot) { struct adfs_discrecord *dr; unsigned int nr_sects; if (adfs_checkbblk(data)) return NULL; dr = (struct adfs_discrecord *)(data + 0x1c0); if (dr->disc_size == 0 && dr->disc_size_high == 0) return NULL; nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | (le32_to_cpu(dr->disc_size) >> 9); if (name) { strlcat(state->pp_buf, " [", PAGE_SIZE); strlcat(state->pp_buf, name, PAGE_SIZE); strlcat(state->pp_buf, "]", PAGE_SIZE); } put_partition(state, slot, first_sector, nr_sects); return dr; } #endif #ifdef CONFIG_ACORN_PARTITION_RISCIX struct riscix_part { __le32 start; __le32 length; __le32 one; char name[16]; }; struct riscix_record { __le32 magic; #define RISCIX_MAGIC cpu_to_le32(0x4a657320) __le32 date; struct riscix_part part[8]; }; #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) static int riscix_partition(struct parsed_partitions *state, unsigned long first_sect, int slot, unsigned long nr_sects) { Sector sect; struct riscix_record *rr; rr = read_part_sector(state, first_sect, §); if (!rr) return -1; strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); if (rr->magic == RISCIX_MAGIC) { unsigned long size = nr_sects > 2 ? 2 : nr_sects; int part; strlcat(state->pp_buf, " <", PAGE_SIZE); put_partition(state, slot++, first_sect, size); for (part = 0; part < 8; part++) { if (rr->part[part].one && memcmp(rr->part[part].name, "All\0", 4)) { put_partition(state, slot++, le32_to_cpu(rr->part[part].start), le32_to_cpu(rr->part[part].length)); strlcat(state->pp_buf, "(", PAGE_SIZE); strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); strlcat(state->pp_buf, ")", PAGE_SIZE); } } strlcat(state->pp_buf, " >\n", PAGE_SIZE); } else { put_partition(state, slot++, first_sect, nr_sects); } put_dev_sector(sect); return slot; } #endif #endif #define LINUX_NATIVE_MAGIC 0xdeafa1de #define LINUX_SWAP_MAGIC 0xdeafab1e struct linux_part { __le32 magic; __le32 start_sect; __le32 nr_sects; }; #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) static int linux_partition(struct parsed_partitions *state, unsigned long first_sect, int slot, unsigned long nr_sects) { Sector sect; struct linux_part *linuxp; unsigned long size = nr_sects > 2 ? 2 : nr_sects; strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); put_partition(state, slot++, first_sect, size); linuxp = read_part_sector(state, first_sect, §); if (!linuxp) return -1; strlcat(state->pp_buf, " <", PAGE_SIZE); while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { if (slot == state->limit) break; put_partition(state, slot++, first_sect + le32_to_cpu(linuxp->start_sect), le32_to_cpu(linuxp->nr_sects)); linuxp ++; } strlcat(state->pp_buf, " >", PAGE_SIZE); put_dev_sector(sect); return slot; } #endif #ifdef CONFIG_ACORN_PARTITION_CUMANA int adfspart_check_CUMANA(struct parsed_partitions *state) { unsigned long first_sector = 0; unsigned int start_blk = 0; Sector sect; unsigned char *data; char *name = "CUMANA/ADFS"; int first = 1; int slot = 1; /* * Try Cumana style partitions - sector 6 contains ADFS boot block * with pointer to next 'drive'. * * There are unknowns in this code - is the 'cylinder number' of the * next partition relative to the start of this one - I'm assuming * it is. * * Also, which ID did Cumana use? * * This is totally unfinished, and will require more work to get it * going. Hence it is totally untested. */ do { struct adfs_discrecord *dr; unsigned int nr_sects; data = read_part_sector(state, start_blk * 2 + 6, §); if (!data) return -1; if (slot == state->limit) break; dr = adfs_partition(state, name, data, first_sector, slot++); if (!dr) break; name = NULL; nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) * (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) * dr->secspertrack; if (!nr_sects) break; first = 0; first_sector += nr_sects; start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9); nr_sects = 0; /* hmm - should be partition size */ switch (data[0x1fc] & 15) { case 0: /* No partition / ADFS? */ break; #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: /* RISCiX - we don't know how to find the next one. */ slot = riscix_partition(state, first_sector, slot, nr_sects); break; #endif case PARTITION_LINUX: slot = linux_partition(state, first_sector, slot, nr_sects); break; } put_dev_sector(sect); if (slot == -1) return -1; } while (1); put_dev_sector(sect); return first ? 0 : 1; } #endif #ifdef CONFIG_ACORN_PARTITION_ADFS /* * Purpose: allocate ADFS partitions. * * Params : hd - pointer to gendisk structure to store partition info. * dev - device number to access. * * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok. * * Alloc : hda = whole drive * hda1 = ADFS partition on first drive. * hda2 = non-ADFS partition. */ int adfspart_check_ADFS(struct parsed_partitions *state) { unsigned long start_sect, nr_sects, sectscyl, heads; Sector sect; unsigned char *data; struct adfs_discrecord *dr; unsigned char id; int slot = 1; data = read_part_sector(state, 6, §); if (!data) return -1; dr = adfs_partition(state, "ADFS", data, 0, slot++); if (!dr) { put_dev_sector(sect); return 0; } heads = dr->heads + ((dr->lowsector >> 6) & 1); sectscyl = dr->secspertrack * heads; start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl; id = data[0x1fc] & 15; put_dev_sector(sect); /* * Work out start of non-adfs partition. */ nr_sects = get_capacity(state->disk) - start_sect; if (start_sect) { switch (id) { #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: case PARTITION_RISCIX_MFM: riscix_partition(state, start_sect, slot, nr_sects); break; #endif case PARTITION_LINUX: linux_partition(state, start_sect, slot, nr_sects); break; } } strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } #endif #ifdef CONFIG_ACORN_PARTITION_ICS struct ics_part { __le32 start; __le32 size; }; static int adfspart_check_ICSLinux(struct parsed_partitions *state, unsigned long block) { Sector sect; unsigned char *data = read_part_sector(state, block, §); int result = 0; if (data) { if (memcmp(data, "LinuxPart", 9) == 0) result = 1; put_dev_sector(sect); } return result; } /* * Check for a valid ICS partition using the checksum. */ static inline int valid_ics_sector(const unsigned char *data) { unsigned long sum; int i; for (i = 0, sum = 0x50617274; i < 508; i++) sum += data[i]; sum -= le32_to_cpu(*(__le32 *)(&data[508])); return sum == 0; } /* * Purpose: allocate ICS partitions. * Params : hd - pointer to gendisk structure to store partition info. * dev - device number to access. * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. * Alloc : hda = whole drive * hda1 = ADFS partition 0 on first drive. * hda2 = ADFS partition 1 on first drive. * ..etc.. */ int adfspart_check_ICS(struct parsed_partitions *state) { const unsigned char *data; const struct ics_part *p; int slot; Sector sect; /* * Try ICS style partitions - sector 0 contains partition info. */ data = read_part_sector(state, 0, §); if (!data) return -1; if (!valid_ics_sector(data)) { put_dev_sector(sect); return 0; } strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { u32 start = le32_to_cpu(p->start); s32 size = le32_to_cpu(p->size); /* yes, it's signed. */ if (slot == state->limit) break; /* * Negative sizes tell the RISC OS ICS driver to ignore * this partition - in effect it says that this does not * contain an ADFS filesystem. */ if (size < 0) { size = -size; /* * Our own extension - We use the first sector * of the partition to identify what type this * partition is. We must not make this visible * to the filesystem. */ if (size > 1 && adfspart_check_ICSLinux(state, start)) { start += 1; size -= 1; } } if (size) put_partition(state, slot++, start, size); } put_dev_sector(sect); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } #endif #ifdef CONFIG_ACORN_PARTITION_POWERTEC struct ptec_part { __le32 unused1; __le32 unused2; __le32 start; __le32 size; __le32 unused5; char type[8]; }; static inline int valid_ptec_sector(const unsigned char *data) { unsigned char checksum = 0x2a; int i; /* * If it looks like a PC/BIOS partition, then it * probably isn't PowerTec. */ if (data[510] == 0x55 && data[511] == 0xaa) return 0; for (i = 0; i < 511; i++) checksum += data[i]; return checksum == data[511]; } /* * Purpose: allocate ICS partitions. * Params : hd - pointer to gendisk structure to store partition info. * dev - device number to access. * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. * Alloc : hda = whole drive * hda1 = ADFS partition 0 on first drive. * hda2 = ADFS partition 1 on first drive. * ..etc.. */ int adfspart_check_POWERTEC(struct parsed_partitions *state) { Sector sect; const unsigned char *data; const struct ptec_part *p; int slot = 1; int i; data = read_part_sector(state, 0, §); if (!data) return -1; if (!valid_ptec_sector(data)) { put_dev_sector(sect); return 0; } strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { u32 start = le32_to_cpu(p->start); u32 size = le32_to_cpu(p->size); if (size) put_partition(state, slot++, start, size); } put_dev_sector(sect); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } #endif #ifdef CONFIG_ACORN_PARTITION_EESOX struct eesox_part { char magic[6]; char name[10]; __le32 start; __le32 unused6; __le32 unused7; __le32 unused8; }; /* * Guess who created this format? */ static const char eesox_name[] = { 'N', 'e', 'i', 'l', ' ', 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' ' }; /* * EESOX SCSI partition format. * * This is a goddamned awful partition format. We don't seem to store * the size of the partition in this table, only the start addresses. * * There are two possibilities where the size comes from: * 1. The individual ADFS boot block entries that are placed on the disk. * 2. The start address of the next entry. */ int adfspart_check_EESOX(struct parsed_partitions *state) { Sector sect; const unsigned char *data; unsigned char buffer[256]; struct eesox_part *p; sector_t start = 0; int i, slot = 1; data = read_part_sector(state, 7, §); if (!data) return -1; /* * "Decrypt" the partition table. God knows why... */ for (i = 0; i < 256; i++) buffer[i] = data[i] ^ eesox_name[i & 15]; put_dev_sector(sect); for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) { sector_t next; if (memcmp(p->magic, "Eesox", 6)) break; next = le32_to_cpu(p->start); if (i) put_partition(state, slot++, start, next - start); start = next; } if (i != 0) { sector_t size; size = get_capacity(state->disk); put_partition(state, slot++, start, size - start); strlcat(state->pp_buf, "\n", PAGE_SIZE); } return i ? 1 : 0; } #endif |
5 129 36 5 1 34 34 211 1 21 21 21 20 1 1 215 64 63 49 49 47 47 141 142 34 34 150 117 34 215 215 143 17 150 52 40 13 145 144 41 120 144 5 216 217 217 30 30 29 30 30 29 30 30 30 48 49 95 96 96 96 95 49 49 1 48 49 48 211 211 210 210 196 14 194 196 169 5 5 5 5 5 5 5 6 6 5 5 5 216 1 1 156 3 216 145 1 157 158 157 52 156 214 215 145 155 217 56 214 217 217 11 14 14 12 1 15 14 1 15 15 10 10 1 2 2 36 41 123 123 123 123 123 41 41 123 89 41 41 41 128 97 5 36 34 34 130 26 130 130 130 89 38 3 123 129 129 130 42 42 42 65 86 211 211 211 211 210 4 93 92 1 1 1 1 1 1 1 1 26 26 217 216 6 217 13 7 155 169 81 8 55 55 55 54 6 6 6 6 6 6 76 76 15 15 2 2 2 2 2 12 12 12 12 6 51 50 51 12 8 8 8 8 74 74 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 | // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables machines with Intel VT-x extensions to run virtual * machines without emulation or binary translation. * * MMU support * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "irq.h" #include "ioapic.h" #include "mmu.h" #include "mmu_internal.h" #include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" #include "smm.h" #include "kvm_emulate.h" #include "page_track.h" #include "cpuid.h" #include "spte.h" #include <linux/kvm_host.h> #include <linux/types.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <linux/swap.h> #include <linux/hugetlb.h> #include <linux/compiler.h> #include <linux/srcu.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> #include <linux/kstrtox.h> #include <linux/kthread.h> #include <linux/wordpart.h> #include <asm/page.h> #include <asm/memtype.h> #include <asm/cmpxchg.h> #include <asm/io.h> #include <asm/set_memory.h> #include <asm/spec-ctrl.h> #include <asm/vmx.h> #include "trace.h" static bool nx_hugepage_mitigation_hard_disabled; int __read_mostly nx_huge_pages = -1; static uint __read_mostly nx_huge_pages_recovery_period_ms; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; #else static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp); static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = get_nx_huge_pages, }; static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { .set = set_nx_huge_pages_recovery_param, .get = param_get_uint, }; module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); __MODULE_PARM_TYPE(nx_huge_pages, "bool"); module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops, &nx_huge_pages_recovery_ratio, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops, &nx_huge_pages_recovery_period_ms, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint"); static bool __read_mostly force_flush_and_sync_on_reuse; module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: * 1. the guest-virtual to guest-physical * 2. while doing 1. it walks guest-physical to host-physical * If the hardware supports that we don't need to do shadow paging. */ bool tdp_enabled = false; static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); #endif static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; #define PTE_PREFETCH_NUM 8 #include <trace/events/kvm.h> /* make pte_list_desc fit well in cache lines */ #define PTE_LIST_EXT 14 /* * struct pte_list_desc is the core data structure used to implement a custom * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a * given GFN when used in the context of rmaps. Using a custom list allows KVM * to optimize for the common case where many GFNs will have at most a handful * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small * memory footprint, which in turn improves runtime performance by exploiting * cache locality. * * A list is comprised of one or more pte_list_desc objects (descriptors). * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor * is full and a new SPTEs needs to be added, a new descriptor is allocated and * becomes the head of the list. This means that by definitions, all tail * descriptors are full. * * Note, the meta data fields are deliberately placed at the start of the * structure to optimize the cacheline layout; accessing the descriptor will * touch only a single cacheline so long as @spte_count<=6 (or if only the * descriptors metadata is accessed). */ struct pte_list_desc { struct pte_list_desc *more; /* The number of PTEs stored in _this_ descriptor. */ u32 spte_count; /* The number of PTEs stored in all tails of this descriptor. */ u32 tail_count; u64 *sptes[PTE_LIST_EXT]; }; struct kvm_shadow_walk_iterator { u64 addr; hpa_t shadow_addr; u64 *sptep; int level; unsigned index; }; #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \ (_root), (_addr)); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) #define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)) && \ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ __shadow_walk_next(&(_walker), spte)) static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; static void mmu_spte_set(u64 *sptep, u64 spte); struct kvm_mmu_role_regs { const unsigned long cr0; const unsigned long cr4; const u64 efer; }; #define CREATE_TRACE_POINTS #include "mmutrace.h" /* * Yes, lot's of underscores. They're a hint that you probably shouldn't be * reading from the role_regs. Once the root_role is constructed, it becomes * the single source of truth for the MMU's state. */ #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ static inline bool __maybe_unused \ ____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \ { \ return !!(regs->reg & flag); \ } BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG); BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57); BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX); BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); /* * The MMU itself (with a valid role) is the single source of truth for the * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1, * and the vCPU may be incorrect/irrelevant. */ #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ { \ return !!(mmu->cpu_role. base_or_ext . reg##_##name); \ } BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57); BUILD_MMU_ROLE_ACCESSOR(base, efer, nx); BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma); static inline bool is_cr0_pg(struct kvm_mmu *mmu) { return mmu->cpu_role.base.level > 0; } static inline bool is_cr4_pae(struct kvm_mmu *mmu) { return !mmu->cpu_role.base.has_4_byte_gpte; } static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) { struct kvm_mmu_role_regs regs = { .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS), .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS), .efer = vcpu->arch.efer, }; return regs; } static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu) { return kvm_read_cr3(vcpu); } static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3) return kvm_read_cr3(vcpu); return mmu->get_guest_pgd(vcpu); } static inline bool kvm_available_flush_remote_tlbs_range(void) { #if IS_ENABLED(CONFIG_HYPERV) return kvm_x86_ops.flush_remote_tlbs_range; #else return false; #endif } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); /* Flush the range of guest memory mapped by the given SPTE. */ static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep)); kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { u64 spte = make_mmio_spte(vcpu, gfn, access); trace_mark_mmio_spte(sptep, gfn, spte); mmu_spte_set(sptep, spte); } static gfn_t get_mmio_spte_gfn(u64 spte) { u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN) & shadow_nonpresent_or_rsvd_mask; return gpa >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) { return spte & shadow_mmio_access_mask; } static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { u64 kvm_gen, spte_gen, gen; gen = kvm_vcpu_memslots(vcpu)->generation; if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) return false; kvm_gen = gen & MMIO_SPTE_GEN_MASK; spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); return likely(kvm_gen == spte_gen); } static int is_cpuid_PSE36(void) { return 1; } #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); return xchg(sptep, spte); } static u64 __get_spte_lockless(u64 *sptep) { return READ_ONCE(*sptep); } #else union split_spte { struct { u32 spte_low; u32 spte_high; }; u64 spte; }; static void count_spte_clear(u64 *sptep, u64 spte) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (is_shadow_present_pte(spte)) return; /* Ensure the spte is completely set before we increase the count */ smp_wmb(); sp->clear_spte_count++; } static void __set_spte(u64 *sptep, u64 spte) { union split_spte *ssptep, sspte; ssptep = (union split_spte *)sptep; sspte = (union split_spte)spte; ssptep->spte_high = sspte.spte_high; /* * If we map the spte from nonpresent to present, We should store * the high bits firstly, then set present bit, so cpu can not * fetch this spte while we are setting the spte. */ smp_wmb(); WRITE_ONCE(ssptep->spte_low, sspte.spte_low); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { union split_spte *ssptep, sspte; ssptep = (union split_spte *)sptep; sspte = (union split_spte)spte; WRITE_ONCE(ssptep->spte_low, sspte.spte_low); /* * If we map the spte from present to nonpresent, we should clear * present bit firstly to avoid vcpu fetch the old high bits. */ smp_wmb(); ssptep->spte_high = sspte.spte_high; count_spte_clear(sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { union split_spte *ssptep, sspte, orig; ssptep = (union split_spte *)sptep; sspte = (union split_spte)spte; /* xchg acts as a barrier before the setting of the high bits */ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); orig.spte_high = ssptep->spte_high; ssptep->spte_high = sspte.spte_high; count_spte_clear(sptep, spte); return orig.spte; } /* * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * * An spte tlb flush may be pending, because they are coalesced and * we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * * Reading the spte while an update is in progress may get the old value * for the high part of the spte. The race is fine for a present->non-present * change (because the high part of the spte is ignored for non-present spte), * but for a present->present change we must reread the spte. * * All such changes are done in two steps (present->non-present and * non-present->present), hence it is enough to count the number of * present->non-present updates: if it changed while reading the spte, * we might have hit the race. This is done using clear_spte_count. */ static u64 __get_spte_lockless(u64 *sptep) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); union split_spte spte, *orig = (union split_spte *)sptep; int count; retry: count = sp->clear_spte_count; smp_rmb(); spte.spte_low = orig->spte_low; smp_rmb(); spte.spte_high = orig->spte_high; smp_rmb(); if (unlikely(spte.spte_low != orig->spte_low || count != sp->clear_spte_count)) goto retry; return spte.spte; } #endif /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present * or in a state where the hardware will not attempt to update * the spte. */ static void mmu_spte_set(u64 *sptep, u64 new_spte) { WARN_ON_ONCE(is_shadow_present_pte(*sptep)); __set_spte(sptep, new_spte); } /* * Update the SPTE (excluding the PFN), but do not track changes in its * accessed/dirty status. */ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; WARN_ON_ONCE(!is_shadow_present_pte(new_spte)); check_spte_writable_invariants(new_spte); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); return old_spte; } if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); return old_spte; } /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changed. * * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only * spte, even though the writable spte might be cached on a CPU's TLB. * * Returns true if the TLB needs to be flushed */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { bool flush = false; u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); if (!is_shadow_present_pte(old_spte)) return false; /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ if (is_mmu_writable_spte(old_spte) && !is_writable_pte(new_spte)) flush = true; /* * Flush TLB when accessed/dirty states are changed in the page tables, * to guarantee consistency between TLB and page tables. */ if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { flush = true; kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { flush = true; kvm_set_pfn_dirty(spte_to_pfn(old_spte)); } return flush; } /* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. * Returns the old PTE. */ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; struct page *page; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); if (!is_shadow_present_pte(old_spte)) return old_spte; kvm_update_page_stats(kvm, level, -1); pfn = spte_to_pfn(old_spte); /* * KVM doesn't hold a reference to any pages mapped into the guest, and * instead uses the mmu_notifier to ensure that KVM unmaps any pages * before they are reclaimed. Sanity check that, if the pfn is backed * by a refcounted page, the refcount is elevated. */ page = kvm_pfn_to_refcounted_page(pfn); WARN_ON_ONCE(page && !page_count(page)); if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); return old_spte; } /* * Rules for using mmu_spte_clear_no_track: * Directly clear spte without caring the state bits of sptep, * it is used to set the upper level spte. */ static void mmu_spte_clear_no_track(u64 *sptep) { __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); } static u64 mmu_spte_get_lockless(u64 *sptep) { return __get_spte_lockless(sptep); } /* Returns the Accessed status of the PTE and resets it at the same time. */ static bool mmu_spte_age(u64 *sptep) { u64 spte = mmu_spte_get_lockless(sptep); if (!is_accessed_spte(spte)) return false; if (spte_ad_enabled(spte)) { clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } else { /* * Capture the dirty status of the page, so that it doesn't get * lost when the SPTE is marked for access tracking. */ if (is_writable_pte(spte)) kvm_set_pfn_dirty(spte_to_pfn(spte)); spte = mark_spte_for_access_track(spte); mmu_spte_update_no_track(sptep, spte); } return true; } static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) { return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; } static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_begin(); } else { /* * Prevent page table teardown by making any free-er wait during * kvm_flush_remote_tlbs() IPI to all active vcpus. */ local_irq_disable(); /* * Make sure a following spte read is not reordered ahead of the write * to vcpu->mode. */ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_end(); } else { /* * Make sure the write to vcpu->mode is not reordered in front of * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) { int r; /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) return r; r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; if (maybe_indirect) { r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; } return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, PT64_ROOT_MAX_LEVEL); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { kmem_cache_free(pte_list_desc_cache, pte_list_desc); } static bool sp_has_gptes(struct kvm_mmu_page *sp); static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) { if (sp->role.passthrough) return sp->gfn; if (sp->shadowed_translation) return sp->shadowed_translation[index] >> PAGE_SHIFT; return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } /* * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note * that the SPTE itself may have a more constrained access permissions that * what the guest enforces. For example, a guest may create an executable * huge PTE but KVM may disallow execution to mitigate iTLB multihit. */ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { if (sp->shadowed_translation) return sp->shadowed_translation[index] & ACC_ALL; /* * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, * KVM is not shadowing any guest page tables, so the "guest access * permissions" are just ACC_ALL. * * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM * is shadowing a guest huge page with small pages, the guest access * permissions being shadowed are the access permissions of the huge * page. * * In both cases, sp->role.access contains the correct access bits. */ return sp->role.access; } static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, gfn_t gfn, unsigned int access) { if (sp->shadowed_translation) { sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } WARN_ONCE(access != kvm_mmu_page_get_access(sp, index), "access mismatch under %s page %llx (expected %u, got %u)\n", sp->role.passthrough ? "passthrough" : "direct", sp->gfn, kvm_mmu_page_get_access(sp, index), access); WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index), "gfn mismatch under %s page %llx (expected %llx, got %llx)\n", sp->role.passthrough ? "passthrough" : "direct", sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn); } static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, unsigned int access) { gfn_t gfn = kvm_mmu_page_get_gfn(sp, index); kvm_mmu_page_set_translation(sp, index, gfn, access); } /* * Return the pointer to the large page information for a given gfn, * handling slots that are not large page aligned. */ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, const struct kvm_memory_slot *slot, int level) { unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); return &slot->arch.lpage_info[level - 2][idx]; } /* * The most significant bit in disallow_lpage tracks whether or not memory * attributes are mixed, i.e. not identical for all gfns at the current level. * The lower order bits are used to refcount other cases where a hugepage is * disallowed, e.g. if KVM has shadow a page table at the gfn. */ #define KVM_LPAGE_MIXED_FLAG BIT(31) static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; int old, i; for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); old = linfo->disallow_lpage; linfo->disallow_lpage += count; WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG); } } void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, 1); } void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, -1); } static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; gfn_t gfn; kvm->arch.indirect_shadow_pages++; /* * Ensure indirect_shadow_pages is elevated prior to re-reading guest * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight * emulated writes are visible before re-reading guest PTEs, or that * an emulated write will see the elevated count and acquire mmu_lock * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write(). */ smp_mb(); gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); /* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PG_LEVEL_4K) return __kvm_write_track_add_gfn(kvm, slot, gfn); kvm_mmu_gfn_disallow_lpage(slot, gfn); if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K); } void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { /* * If it's possible to replace the shadow page with an NX huge page, * i.e. if the shadow page is the only thing currently preventing KVM * from using a huge page, add the shadow page to the list of "to be * zapped for NX recovery" pages. Note, the shadow page can already be * on the list if KVM is reusing an existing shadow page, i.e. if KVM * links a shadow page at multiple points. */ if (!list_empty(&sp->possible_nx_huge_page_link)) return; ++kvm->stat.nx_lpage_splits; list_add_tail(&sp->possible_nx_huge_page_link, &kvm->arch.possible_nx_huge_pages); } static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, bool nx_huge_page_possible) { sp->nx_huge_page_disallowed = true; if (nx_huge_page_possible) track_possible_nx_huge_page(kvm, sp); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; gfn_t gfn; kvm->arch.indirect_shadow_pages--; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); if (sp->role.level > PG_LEVEL_4K) return __kvm_write_track_remove_gfn(kvm, slot, gfn); kvm_mmu_gfn_allow_lpage(slot, gfn); } void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { if (list_empty(&sp->possible_nx_huge_page_link)) return; --kvm->stat.nx_lpage_splits; list_del_init(&sp->possible_nx_huge_page_link); } static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { sp->nx_huge_page_disallowed = false; untrack_possible_nx_huge_page(kvm, sp); } static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return NULL; if (no_dirty_log && kvm_slot_dirty_track_enabled(slot)) return NULL; return slot; } /* * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct * pte_list_desc containing more mappings. */ /* * Returns the number of pointers in the rmap chain, not counting the new one. */ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int count = 0; if (!rmap_head->val) { rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; rmap_head->val = (unsigned long)desc | 1; ++count; } else { desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); count = desc->tail_count + desc->spte_count; /* * If the previous head is full, allocate a new head descriptor * as tail descriptors are always kept full. */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul); desc->spte_count = 0; desc->tail_count = count; rmap_head->val = (unsigned long)desc | 1; } desc->sptes[desc->spte_count++] = spte; } return count; } static void pte_list_desc_remove_entry(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i) { struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); int j = head_desc->spte_count - 1; /* * The head descriptor should never be empty. A new head is added only * when adding an entry and the previous head is full, and heads are * removed (this flow) when they become empty. */ KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm); /* * Replace the to-be-freed SPTE with the last valid entry from the head * descriptor to ensure that tail descriptors are full at all times. * Note, this also means that tail_count is stable for each descriptor. */ desc->sptes[i] = head_desc->sptes[j]; head_desc->sptes[j] = NULL; head_desc->spte_count--; if (head_desc->spte_count) return; /* * The head descriptor is empty. If there are no tail descriptors, * nullify the rmap head to mark the list as empty, else point the rmap * head at the next descriptor, i.e. the new head. */ if (!head_desc->more) rmap_head->val = 0; else rmap_head->val = (unsigned long)head_desc->more | 1; mmu_free_pte_list_desc(head_desc); } static void pte_list_remove(struct kvm *kvm, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int i; if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) return; if (!(rmap_head->val & 1)) { if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) return; rmap_head->val = 0; } else { desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { pte_list_desc_remove_entry(kvm, rmap_head, desc, i); return; } } desc = desc->more; } KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } } static void kvm_zap_one_rmap_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); pte_list_remove(kvm, sptep, rmap_head); } /* Return true if at least one SPTE was zapped, false otherwise */ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; if (!rmap_head->val) return false; if (!(rmap_head->val & 1)) { mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); goto out; } desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) mmu_spte_clear_track_bits(kvm, desc->sptes[i]); next = desc->more; mmu_free_pte_list_desc(desc); } out: /* rmap_head is meaningless now, remember to reset it */ rmap_head->val = 0; return true; } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; if (!rmap_head->val) return 0; else if (!(rmap_head->val & 1)) return 1; desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); return desc->tail_count + desc->spte_count; } static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, const struct kvm_memory_slot *slot) { unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; gfn_t gfn; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte)); /* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU * so we have to determine which memslots to use based on context * information in sp->role. */ slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); pte_list_remove(kvm, spte, rmap_head); } /* * Used by the following functions to iterate through the sptes linked by a * rmap. All fields are private and not assumed to be used outside. */ struct rmap_iterator { /* private fields */ struct pte_list_desc *desc; /* holds the sptep if not NULL */ int pos; /* index of the sptep */ }; /* * Iteration must be started by this function. This should also be used after * removing/dropping sptes from the rmap link because in such cases the * information in the iterator may not be valid. * * Returns sptep if found, NULL otherwise. */ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct rmap_iterator *iter) { u64 *sptep; if (!rmap_head->val) return NULL; if (!(rmap_head->val & 1)) { iter->desc = NULL; sptep = (u64 *)rmap_head->val; goto out; } iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); iter->pos = 0; sptep = iter->desc->sptes[iter->pos]; out: BUG_ON(!is_shadow_present_pte(*sptep)); return sptep; } /* * Must be used with a valid iterator: e.g. after rmap_get_first(). * * Returns sptep if found, NULL otherwise. */ static u64 *rmap_get_next(struct rmap_iterator *iter) { u64 *sptep; if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { ++iter->pos; sptep = iter->desc->sptes[iter->pos]; if (sptep) goto out; } iter->desc = iter->desc->more; if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ sptep = iter->desc->sptes[iter->pos]; goto out; } } return NULL; out: BUG_ON(!is_shadow_present_pte(*sptep)); return sptep; } #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ _spte_; _spte_ = rmap_get_next(_iter_)) static void drop_spte(struct kvm *kvm, u64 *sptep) { u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep); if (is_shadow_present_pte(old_spte)) rmap_remove(kvm, sptep); } static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) { struct kvm_mmu_page *sp; sp = sptep_to_sp(sptep); WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); if (flush) kvm_flush_remote_tlbs_sptep(kvm, sptep); } /* * Write-protect on the specified @sptep, @pt_protect indicates whether * spte write-protection is caused by protecting shadow page table. * * Note: write protection is difference between dirty logging and spte * protection: * - for dirty logging, the spte can be set to writable at anytime if * its dirty bitmap is properly set. * - for spte protection, the spte can be writable only after unsync-ing * shadow page. * * Return true if tlb need be flushed. */ static bool spte_write_protect(u64 *sptep, bool pt_protect) { u64 spte = *sptep; if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte))) return false; if (pt_protect) spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; return mmu_spte_update(sptep, spte); } static bool rmap_write_protect(struct kvm_rmap_head *rmap_head, bool pt_protect) { u64 *sptep; struct rmap_iterator iter; bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_write_protect(sptep, pt_protect); return flush; } static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; KVM_MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; return mmu_spte_update(sptep, spte); } static bool spte_wrprot_for_clear_dirty(u64 *sptep) { bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); if (was_writable && !spte_ad_enabled(*sptep)) kvm_set_pfn_dirty(spte_to_pfn(*sptep)); return was_writable; } /* * Gets the GFN ready for another round of dirty logging by clearing the * - D bit on ad-enabled SPTEs, and * - W bit on ad-disabled SPTEs. * Returns true iff any D or W bits were cleared. */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) if (spte_ad_need_write_protect(*sptep)) flush |= spte_wrprot_for_clear_dirty(sptep); else flush |= spte_clear_dirty(sptep); return flush; } /** * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance * @slot: slot to protect * @gfn_offset: start of the BITS_PER_LONG pages we care about * @mask: indicates which pages we should protect * * Used when we do not need to care about huge page mappings. */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { struct kvm_rmap_head *rmap_head; if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); if (!kvm_memslots_have_rmaps(kvm)) return; while (mask) { rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); rmap_write_protect(rmap_head, false); /* clear the first set bit */ mask &= mask - 1; } } /** * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write * protect the page if the D-bit isn't supported. * @kvm: kvm instance * @slot: slot to clear D-bit * @gfn_offset: start of the BITS_PER_LONG pages we care about * @mask: indicates which pages we should clear D-bit * * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. */ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { struct kvm_rmap_head *rmap_head; if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); if (!kvm_memslots_have_rmaps(kvm)) return; while (mask) { rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ mask &= mask - 1; } } /** * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected * PT level pages. * * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to * enable dirty logging for them. * * We need to care about huge page mappings: e.g. during dirty logging we may * have such mappings. */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { /* * Huge pages are NOT write protected when we start dirty logging in * initially-all-set mode; must write protect them here so that they * are split to 4K on the first write. * * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn * of memslot has no such restriction, so the range can cross two large * pages. */ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); if (READ_ONCE(eager_page_split)) kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K); kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); /* Cross two large pages? */ if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) != ALIGN(end << PAGE_SHIFT, PMD_SIZE)) kvm_mmu_slot_gfn_write_protect(kvm, slot, end, PG_LEVEL_2M); } /* Now handle 4K PTEs. */ if (kvm_x86_ops.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } int kvm_cpu_dirty_log_size(void) { return kvm_x86_ops.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level) { struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = gfn_to_rmap(gfn, i, slot); write_protected |= rmap_write_protect(rmap_head, true); } } if (tdp_mmu_enabled) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); return write_protected; } static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { return kvm_zap_all_rmap_sptes(kvm, rmap_head); } static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level) { return __kvm_zap_rmap(kvm, rmap_head, slot); } struct slot_rmap_walk_iterator { /* input fields. */ const struct kvm_memory_slot *slot; gfn_t start_gfn; gfn_t end_gfn; int start_level; int end_level; /* output fields. */ gfn_t gfn; struct kvm_rmap_head *rmap; int level; /* private field. */ struct kvm_rmap_head *end_rmap; }; static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, const struct kvm_memory_slot *slot, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; iterator->start_level = start_level; iterator->end_level = end_level; iterator->start_gfn = start_gfn; iterator->end_gfn = end_gfn; rmap_walk_init_level(iterator, iterator->start_level); } static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) { return !!iterator->rmap; } static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { while (++iterator->rmap <= iterator->end_rmap) { iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); if (iterator->rmap->val) return; } if (++iterator->level > iterator->end_level) { iterator->rmap = NULL; return; } rmap_walk_init_level(iterator, iterator->level); } #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ _start_gfn, _end_gfn, _iter_) \ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ _end_level_, _start_gfn, _end_gfn); \ slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level); static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, rmap_handler_t handler) { struct slot_rmap_walk_iterator iterator; bool ret = false; for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, range->start, range->end - 1, &iterator) ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, iterator.level); return ret; } bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { bool flush = false; if (kvm_memslots_have_rmaps(kvm)) flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); if (kvm_x86_ops.set_apic_access_page_addr && range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); return flush; } static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level) { u64 *sptep; struct rmap_iterator iter; int young = 0; for_each_rmap_spte(rmap_head, &iter, sptep) young |= mmu_spte_age(sptep); return young; } static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level) { u64 *sptep; struct rmap_iterator iter; for_each_rmap_spte(rmap_head, &iter, sptep) if (is_accessed_spte(*sptep)) return true; return false; } #define RMAP_RECYCLE_THRESHOLD 1000 static void __rmap_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, const struct kvm_memory_slot *slot, u64 *spte, gfn_t gfn, unsigned int access) { struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; int rmap_count; sp = sptep_to_sp(spte); kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access); kvm_update_page_stats(kvm, sp->role.level, 1); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); rmap_count = pte_list_add(cache, spte, rmap_head); if (rmap_count > kvm->stat.max_mmu_rmap_size) kvm->stat.max_mmu_rmap_size = rmap_count; if (rmap_count > RMAP_RECYCLE_THRESHOLD) { kvm_zap_all_rmap_sptes(kvm, rmap_head); kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); } } static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, u64 *spte, gfn_t gfn, unsigned int access) { struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); return young; } bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); return young; } static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) { #ifdef CONFIG_KVM_PROVE_MMU int i; for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i]))) pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free", sp->spt[i], &sp->spt[i], kvm_mmu_page_get_gfn(sp, i)); } #endif } /* * This value is the sum of all of the kvm instances's * kvm->arch.n_used_mmu_pages values. We need a global, * aggregate version in order to make the slab shrinker * faster */ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) { kvm->arch.n_used_mmu_pages += nr; percpu_counter_add(&kvm_total_used_mmu_pages, nr); } static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_mod_used_mmu_pages(kvm, +1); kvm_account_pgtable_pages((void *)sp->spt, +1); } static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_mod_used_mmu_pages(kvm, -1); kvm_account_pgtable_pages((void *)sp->spt, -1); } static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { kvm_mmu_check_sptes_at_free(sp); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } static unsigned kvm_page_table_hashfn(gfn_t gfn) { return hash_64(gfn, KVM_MMU_HASH_SHIFT); } static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; pte_list_add(cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { pte_list_remove(kvm, parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { mmu_page_remove_parent_pte(kvm, sp, parent_pte); mmu_spte_clear_no_track(parent_pte); } static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { mark_unsync(sptep); } } static void mark_unsync(u64 *spte) { struct kvm_mmu_page *sp; sp = sptep_to_sp(spte); if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) return; if (sp->unsync_children++) return; kvm_mmu_mark_parents_unsync(sp); } #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { struct mmu_page_and_offset { struct kvm_mmu_page *sp; unsigned int idx; } page[KVM_PAGE_ARRAY_NR]; unsigned int nr; }; static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, int idx) { int i; if (sp->unsync) for (i=0; i < pvec->nr; i++) if (pvec->page[i].sp == sp) return 0; pvec->page[pvec->nr].sp = sp; pvec->page[pvec->nr].idx = idx; pvec->nr++; return (pvec->nr == KVM_PAGE_ARRAY_NR); } static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) { --sp->unsync_children; WARN_ON_ONCE((int)sp->unsync_children < 0); __clear_bit(idx, sp->unsync_child_bitmap); } static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { int i, ret, nr_unsync_leaf = 0; for_each_set_bit(i, sp->unsync_child_bitmap, 512) { struct kvm_mmu_page *child; u64 ent = sp->spt[i]; if (!is_shadow_present_pte(ent) || is_large_pte(ent)) { clear_unsync_child_bit(sp, i); continue; } child = spte_to_child_sp(ent); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) return -ENOSPC; ret = __mmu_unsync_walk(child, pvec); if (!ret) { clear_unsync_child_bit(sp, i); continue; } else if (ret > 0) { nr_unsync_leaf += ret; } else return ret; } else if (child->unsync) { nr_unsync_leaf++; if (mmu_pages_add(pvec, child, i)) return -ENOSPC; } else clear_unsync_child_bit(sp, i); } return nr_unsync_leaf; } #define INVALID_INDEX (-1) static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { pvec->nr = 0; if (!sp->unsync_children) return 0; mmu_pages_add(pvec, sp, INVALID_INDEX); return __mmu_unsync_walk(sp, pvec); } static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON_ONCE(!sp->unsync); trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; } static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list); static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); static bool sp_has_gptes(struct kvm_mmu_page *sp) { if (sp->role.direct) return false; if (sp->role.passthrough) return false; return true; } #define for_each_valid_sp(_kvm, _sp, _list) \ hlist_for_each_entry(_sp, _list, hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ for_each_valid_sp(_kvm, _sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; /* * Ignore various flags when verifying that it's safe to sync a shadow * page using the current MMU context. * * - level: not part of the overall MMU role and will never match as the MMU's * level tracks the root level * - access: updated based on the new guest PTE * - quadrant: not part of the overall MMU role (similar to level) */ const union kvm_mmu_page_role sync_role_ign = { .level = 0xf, .access = 0x7, .quadrant = 0x3, .passthrough = 0x1, }; /* * Direct pages can never be unsync, and KVM should never attempt to * sync a shadow page for a different MMU context, e.g. if the role * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the * reserved bits checks will be wrong, etc... */ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte || (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) return false; return true; } static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { /* sp->spt[i] has initial value of shadow page table allocation */ if (sp->spt[i] == SHADOW_NONPRESENT_VALUE) return 0; return vcpu->arch.mmu->sync_spte(vcpu, sp, i); } static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int flush = 0; int i; if (!kvm_sync_page_check(vcpu, sp)) return -1; for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { int ret = kvm_sync_spte(vcpu, sp, i); if (ret < -1) return -1; flush |= ret; } /* * Note, any flush is purely for KVM's correctness, e.g. when dropping * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier * unmap or dirty logging event doesn't fail to flush. The guest is * responsible for flushing the TLB to ensure any changes in protection * bits are recognized, i.e. until the guest flushes or page faults on * a relevant address, KVM is architecturally allowed to let vCPUs use * cached translations with the old protection bits. */ return flush; } static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int ret = __kvm_sync_page(vcpu, sp); if (ret < 0) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return ret; } static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, struct list_head *invalid_list, bool remote_flush) { if (!remote_flush && list_empty(invalid_list)) return false; if (!list_empty(invalid_list)) kvm_mmu_commit_zap_page(kvm, invalid_list); else kvm_flush_remote_tlbs(kvm); return true; } static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->role.invalid) return true; /* TDP MMU pages do not use the MMU generation. */ return !is_tdp_mmu_page(sp) && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL]; unsigned int idx[PT64_ROOT_MAX_LEVEL]; }; #define for_each_sp(pvec, sp, parents, i) \ for (i = mmu_pages_first(&pvec, &parents); \ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ i = mmu_pages_next(&pvec, &parents, i)) static int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, int i) { int n; for (n = i+1; n < pvec->nr; n++) { struct kvm_mmu_page *sp = pvec->page[n].sp; unsigned idx = pvec->page[n].idx; int level = sp->role.level; parents->idx[level-1] = idx; if (level == PG_LEVEL_4K) break; parents->parent[level-2] = sp; } return n; } static int mmu_pages_first(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents) { struct kvm_mmu_page *sp; int level; if (pvec->nr == 0) return 0; WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX); sp = pvec->page[0].sp; level = sp->role.level; WARN_ON_ONCE(level == PG_LEVEL_4K); parents->parent[level-2] = sp; /* Also set up a sentinel. Further entries in pvec are all * children of sp, so this element is never overwritten. */ parents->parent[level-1] = NULL; return mmu_pages_next(pvec, parents, 0); } static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; unsigned int level = 0; do { unsigned int idx = parents->idx[level]; sp = parents->parent[level]; if (!sp) return; WARN_ON_ONCE(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; } while (!sp->unsync_children); } static int mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *parent, bool can_yield) { int i; struct kvm_mmu_page *sp; struct mmu_page_path parents; struct kvm_mmu_pages pages; LIST_HEAD(invalid_list); bool flush = false; while (mmu_unsync_walk(parent, &pages)) { bool protected = false; for_each_sp(pages, sp, parents, i) protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn); if (protected) { kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); flush = false; } for_each_sp(pages, sp, parents, i) { kvm_unlink_unsync_page(vcpu->kvm, sp); flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0; mmu_pages_clear_parents(&parents); } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); if (!can_yield) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); return -EINTR; } cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); flush = false; } } kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); return 0; } static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) { atomic_set(&sp->write_flooding_count, 0); } static void clear_sp_write_flooding_count(u64 *spte) { __clear_sp_write_flooding_count(sptep_to_sp(spte)); } /* * The vCPU is required when finding indirect shadow pages; the shadow * page may already exist and syncing it needs the vCPU pointer in * order to read guest page tables. Direct shadow pages are never * unsync, thus @vcpu can be NULL if @role.direct is true. */ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, struct hlist_head *sp_list, union kvm_mmu_page_role role) { struct kvm_mmu_page *sp; int ret; int collisions = 0; LIST_HEAD(invalid_list); for_each_valid_sp(kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; } if (sp->role.word != role.word) { /* * If the guest is creating an upper-level page, zap * unsync pages for the same gfn. While it's possible * the guest is using recursive page tables, in all * likelihood the guest has stopped using the unsync * page and is installing a completely unrelated page. * Unsync pages must not be left as is, because the new * upper-level page will be write-protected. */ if (role.level > PG_LEVEL_4K && sp->unsync) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); continue; } /* unsync and write-flooding only apply to indirect SPs. */ if (sp->role.direct) goto out; if (sp->unsync) { if (KVM_BUG_ON(!vcpu, kvm)) break; /* * The page is good, but is stale. kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) * it doesn't write-protect the page or mark it synchronized! * This way the validity of the mapping is ensured, but the * overhead of write protection is not incurred until the * guest invalidates the TLB mapping. This allows multiple * SPs for a single gfn to be unsync. * * If the sync fails, the page is zapped. If so, break * in order to rebuild it. */ ret = kvm_sync_page(vcpu, sp, &invalid_list); if (ret < 0) break; WARN_ON_ONCE(!list_empty(&invalid_list)); if (ret > 0) kvm_flush_remote_tlbs(kvm); } __clear_sp_write_flooding_count(sp); goto out; } sp = NULL; ++kvm->stat.mmu_cache_miss; out: kvm_mmu_commit_zap_page(kvm, &invalid_list); if (collisions > kvm->stat.max_mmu_page_hash_collisions) kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } /* Caches used when allocating a new shadow page. */ struct shadow_page_caches { struct kvm_mmu_memory_cache *page_header_cache; struct kvm_mmu_memory_cache *shadow_page_cache; struct kvm_mmu_memory_cache *shadowed_info_cache; }; static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, struct shadow_page_caches *caches, gfn_t gfn, struct hlist_head *sp_list, union kvm_mmu_page_role role) { struct kvm_mmu_page *sp; sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL) sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); /* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() * depends on valid pages being added to the head of the list. See * comments in kvm_zap_obsolete_pages(). */ sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; list_add(&sp->link, &kvm->arch.active_mmu_pages); kvm_account_mmu_page(kvm, sp); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); if (sp_has_gptes(sp)) account_shadowed(kvm, sp); return sp; } /* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu, struct shadow_page_caches *caches, gfn_t gfn, union kvm_mmu_page_role role) { struct hlist_head *sp_list; struct kvm_mmu_page *sp; bool created = false; sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); if (!sp) { created = true; sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role); } trace_kvm_mmu_get_page(sp, created); return sp; } static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, gfn_t gfn, union kvm_mmu_page_role role) { struct shadow_page_caches caches = { .page_header_cache = &vcpu->arch.mmu_page_header_cache, .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, }; return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); } static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, unsigned int access) { struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); union kvm_mmu_page_role role; role = parent_sp->role; role.level--; role.access = access; role.direct = direct; role.passthrough = 0; /* * If the guest has 4-byte PTEs then that means it's using 32-bit, * 2-level, non-PAE paging. KVM shadows such guests with PAE paging * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must * shadow each guest page table with multiple shadow page tables, which * requires extra bookkeeping in the role. * * Specifically, to shadow the guest's page directory (which covers a * 4GiB address space), KVM uses 4 PAE page directories, each mapping * 1GiB of the address space. @role.quadrant encodes which quarter of * the address space each maps. * * To shadow the guest's page tables (which each map a 4MiB region), KVM * uses 2 PAE page tables, each mapping a 2MiB region. For these, * @role.quadrant encodes which half of the region they map. * * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE * consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow * PDPTEs; those 4 PAE page directories are pre-allocated and their * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume * bit 21 in the PTE (the child here), KVM propagates that bit to the * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE * covers bit 21 (see above), thus the quadrant is calculated from the * _least_ significant bit of the PDE index. */ if (role.has_4_byte_gpte) { WARN_ON_ONCE(role.level != PG_LEVEL_4K); role.quadrant = spte_index(sptep) & 1; } return role; } static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, bool direct, unsigned int access) { union kvm_mmu_page_role role; if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) return ERR_PTR(-EEXIST); role = kvm_mmu_child_role(sptep, direct, access); return kvm_mmu_get_shadow_page(vcpu, gfn, role); } static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, hpa_t root, u64 addr) { iterator->addr = addr; iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu->root_role.level; if (iterator->level >= PT64_ROOT_4LEVEL && vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL && !vcpu->arch.mmu->root_role.direct) iterator->level = PT32E_ROOT_LEVEL; if (iterator->level == PT32E_ROOT_LEVEL) { /* * prev_root is currently only used for 64-bit hosts. So only * the active root_hpa is valid here. */ BUG_ON(root != vcpu->arch.mmu->root.hpa); iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= SPTE_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) iterator->level = 0; } } static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, u64 addr) { shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa, addr); } static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PG_LEVEL_4K) return false; iterator->index = SPTE_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; } static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, u64 spte) { if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) { iterator->level = 0; return; } iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK; --iterator->level; } static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) { __shadow_walk_next(iterator, *iterator->sptep); } static void __link_shadow_page(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, u64 *sptep, struct kvm_mmu_page *sp, bool flush) { u64 spte; BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); /* * If an SPTE is present already, it must be a leaf and therefore * a large one. Drop it, and flush the TLB if needed, before * installing sp. */ if (is_shadow_present_pte(*sptep)) drop_large_spte(kvm, sptep, flush); spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); mmu_page_add_parent_pte(cache, sp, sptep); /* * The non-direct sub-pagetable must be updated before linking. For * L1 sp, the pagetable is updated via kvm_sync_page() in * kvm_mmu_find_shadow_page() without write-protecting the gfn, * so sp->unsync can be true or false. For higher level non-direct * sp, the pagetable is updated/synced via mmu_sync_children() in * FNAME(fetch)(), so sp->unsync_children can only be false. * WARN_ON_ONCE() if anything happens unexpectedly. */ if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync) mark_unsync(sptep); } static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp) { __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true); } static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; /* * For the direct sp, if the guest pte's dirty bit * changed form clean to dirty, it will corrupt the * sp's access: allow writable in the read-only sp, * so we should update the spte at this point to get * a new sp with the correct access. */ child = spte_to_child_sp(*sptep); if (child->role.access == direct_access) return; drop_parent_pte(vcpu->kvm, child, sptep); kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); } } /* Returns the number of zapped non-leaf child shadow pages. */ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *spte, struct list_head *invalid_list) { u64 pte; struct kvm_mmu_page *child; pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { child = spte_to_child_sp(pte); drop_parent_pte(kvm, child, spte); /* * Recursively zap nested TDP SPs, parentless SPs are * unlikely to be used again in the near future. This * avoids retaining a large number of stale nested SPs. */ if (tdp_enabled && invalid_list && child->role.guest_mode && !child->parent_ptes.val) return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } } else if (is_mmio_spte(kvm, pte)) { mmu_spte_clear_no_track(spte); } return 0; } static int kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int zapped = 0; unsigned i; for (i = 0; i < SPTE_ENT_PER_PAGE; ++i) zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); return zapped; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) drop_parent_pte(kvm, sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *parent, struct list_head *invalid_list) { int i, zapped = 0; struct mmu_page_path parents; struct kvm_mmu_pages pages; if (parent->role.level == PG_LEVEL_4K) return 0; while (mmu_unsync_walk(parent, &pages)) { struct kvm_mmu_page *sp; for_each_sp(pages, sp, parents, i) { kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); mmu_pages_clear_parents(&parents); zapped++; } } return zapped; } static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list, int *nr_zapped) { bool list_unstable, zapped_root = false; lockdep_assert_held_write(&kvm->mmu_lock); trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. */ list_unstable = *nr_zapped; if (!sp->role.invalid && sp_has_gptes(sp)) unaccount_shadowed(kvm, sp); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { /* Count self */ (*nr_zapped)++; /* * Already invalid pages (previously active roots) are not on * the active page list. See list_del() in the "else" case of * !sp->root_count. */ if (sp->role.invalid) list_add(&sp->link, invalid_list); else list_move(&sp->link, invalid_list); kvm_unaccount_mmu_page(kvm, sp); } else { /* * Remove the active root from the active page list, the root * will be explicitly freed when the root_count hits zero. */ list_del(&sp->link); /* * Obsolete pages cannot be used on any vCPUs, see the comment * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also * treats invalid shadow pages as being obsolete. */ zapped_root = !is_obsolete_sp(kvm, sp); } if (sp->nx_huge_page_disallowed) unaccount_nx_huge_page(kvm, sp); sp->role.invalid = 1; /* * Make the request to free obsolete roots after marking the root * invalid, otherwise other vCPUs may not see it as invalid. */ if (zapped_root) kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS); return list_unstable; } static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int nr_zapped; __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); return nr_zapped; } static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list) { struct kvm_mmu_page *sp, *nsp; if (list_empty(invalid_list)) return; /* * We need to make sure everyone sees our modifications to * the page tables and see changes to vcpu->mode here. The barrier * in the kvm_flush_remote_tlbs() achieves this. This pairs * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. * * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit * guest mode and/or lockless shadow page table walks. */ kvm_flush_remote_tlbs(kvm); list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON_ONCE(!sp->role.invalid || sp->root_count); kvm_mmu_free_shadow_page(sp); } } static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, unsigned long nr_to_zap) { unsigned long total_zapped = 0; struct kvm_mmu_page *sp, *tmp; LIST_HEAD(invalid_list); bool unstable; int nr_zapped; if (list_empty(&kvm->arch.active_mmu_pages)) return 0; restart: list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { /* * Don't zap active root pages, the page itself can't be freed * and zapping it will just force vCPUs to realloc and reload. */ if (sp->root_count) continue; unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &nr_zapped); total_zapped += nr_zapped; if (total_zapped >= nr_to_zap) break; if (unstable) goto restart; } kvm_mmu_commit_zap_page(kvm, &invalid_list); kvm->stat.mmu_recycled += total_zapped; return total_zapped; } static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) { if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) return kvm->arch.n_max_mmu_pages - kvm->arch.n_used_mmu_pages; return 0; } static int make_mmu_pages_available(struct kvm_vcpu *vcpu) { unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) return 0; kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); /* * Note, this check is intentionally soft, it only guarantees that one * page is available, while the caller may end up allocating as many as * four pages, e.g. for PAE roots or for 5-level paging. Temporarily * exceeding the (arbitrary by default) limit will not harm the host, * being too aggressive may unnecessarily kill the guest, and getting an * exact count is far more trouble than it's worth, especially in the * page fault paths. */ if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; return 0; } /* * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get dead lock */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { write_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - goal_nr_mmu_pages); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; write_unlock(&kvm->mmu_lock); } int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); int r; r = 0; write_lock(&kvm->mmu_lock); for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { r = 1; kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); return r; } static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa; int r; if (vcpu->arch.mmu->root_role.direct) return 0; gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); return r; } static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); ++kvm->stat.mmu_unsync; sp->unsync = 1; kvm_mmu_mark_parents_unsync(sp); } /* * Attempt to unsync any shadow pages that can be reached by the specified gfn, * KVM is creating a writable mapping for said gfn. Returns 0 if all pages * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected. */ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool can_unsync, bool prefetch) { struct kvm_mmu_page *sp; bool locked = false; /* * Force write-protection if the page is being tracked. Note, the page * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ if (kvm_gfn_is_write_tracked(kvm, slot, gfn)) return -EPERM; /* * The page is not write-tracked, mark existing shadow pages unsync * unless KVM is synchronizing an unsync SP (can_unsync = false). In * that case, KVM must complete emulation of the guest TLB flush before * allowing shadow pages to become unsync (writable by the guest). */ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { if (!can_unsync) return -EPERM; if (sp->unsync) continue; if (prefetch) return -EEXIST; /* * TDP MMU page faults require an additional spinlock as they * run with mmu_lock held for read, not write, and the unsync * logic is not thread safe. Take the spinklock regardless of * the MMU type to avoid extra conditionals/parameters, there's * no meaningful penalty if mmu_lock is held for write. */ if (!locked) { locked = true; spin_lock(&kvm->arch.mmu_unsync_pages_lock); /* * Recheck after taking the spinlock, a different vCPU * may have since marked the page unsync. A false * negative on the unprotected check above is not * possible as clearing sp->unsync _must_ hold mmu_lock * for write, i.e. unsync cannot transition from 1->0 * while this CPU holds mmu_lock for read (or write). */ if (READ_ONCE(sp->unsync)) continue; } WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(kvm, sp); } if (locked) spin_unlock(&kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible * before the SPTE is updated to allow writes because * kvm_mmu_sync_roots() checks the unsync flags without holding * the MMU lock and so can race with this. If the SPTE was updated * before the page had been marked as unsync-ed, something like the * following could happen: * * CPU 1 CPU 2 * --------------------------------------------------------------------- * 1.2 Host updates SPTE * to be writable * 2.1 Guest writes a GPTE for GVA X. * (GPTE being in the guest page table shadowed * by the SP from CPU 1.) * This reads SPTE during the page table walk. * Since SPTE.W is read as 1, there is no * fault. * * 2.2 Guest issues TLB flush. * That causes a VM Exit. * * 2.3 Walking of unsync pages sees sp->unsync is * false and skips the page. * * 2.4 Guest accesses GVA X. * Since the mapping in the SP was not updated, * so the old mapping for GVA X incorrectly * gets used. * 1.1 Host marks SP * as unsync * (sp->unsync = true) * * The write barrier below ensures that 1.1 happens before 1.2 and thus * the situation in 2.4 does not arise. It pairs with the read barrier * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3. */ smp_wmb(); return 0; } static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, u64 *sptep, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); int level = sp->role.level; int was_rmapped = 0; int ret = RET_PF_FIXED; bool flush = false; bool wrprot; u64 spte; /* Prefetching always gets a writable pfn. */ bool host_writable = !fault || fault->map_writable; bool prefetch = !fault || fault->prefetch; bool write_fault = fault && fault->write; if (unlikely(is_noslot_pfn(pfn))) { vcpu->stat.pf_mmio_spte_created++; mark_mmio_spte(vcpu, sptep, gfn, pte_access); return RET_PF_EMULATE; } if (is_shadow_present_pte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. */ if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; u64 pte = *sptep; child = spte_to_child_sp(pte); drop_parent_pte(vcpu->kvm, child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { drop_spte(vcpu->kvm, sptep); flush = true; } else was_rmapped = 1; } wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, true, host_writable, &spte); if (*sptep == spte) { ret = RET_PF_SPURIOUS; } else { flush |= mmu_spte_update(sptep, spte); trace_kvm_mmu_set_spte(level, gfn, sptep); } if (wrprot) { if (write_fault) ret = RET_PF_EMULATE; } if (flush) kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); rmap_add(vcpu, slot, sptep, gfn, pte_access); } else { /* Already rmapped but the pte_access bits may have changed. */ kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access); } return ret; } static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) { struct page *pages[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; unsigned int access = sp->role.access; int i, ret; gfn_t gfn; gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) return -1; ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); if (ret <= 0) return -1; for (i = 0; i < ret; i++, gfn++, start++) { mmu_set_spte(vcpu, slot, start, access, gfn, page_to_pfn(pages[i]), NULL); put_page(pages[i]); } return 0; } static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *sptep) { u64 *spte, *start = NULL; int i; WARN_ON_ONCE(!sp->role.direct); i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { if (is_shadow_present_pte(*spte) || spte == sptep) { if (!start) continue; if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) return; start = NULL; } else if (!start) start = spte; } if (start) direct_pte_prefetch_many(vcpu, sp, start, spte); } static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; sp = sptep_to_sp(sptep); /* * Without accessed bits, there's no way to distinguish between * actually accessed translations and prefetched, so disable pte * prefetch if accessed bits aren't available. */ if (sp_ad_disabled(sp)) return; if (sp->role.level > PG_LEVEL_4K) return; /* * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. */ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; __direct_pte_prefetch(vcpu, sp, sptep); } /* * Lookup the mapping level for @gfn in the current mm. * * WARNING! Use of host_pfn_mapping_level() requires the caller and the end * consumer to be tied into KVM's handlers for MMU notifier events! * * There are several ways to safely use this helper: * * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before * consuming it. In this case, mmu_lock doesn't need to be held during the * lookup, but it does need to be held while checking the MMU notifier. * * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation * event for the hva. This can be done by explicit checking the MMU notifier * or by ensuring that KVM already has a valid mapping that covers the hva. * * - Do not use the result to install new mappings, e.g. use the host mapping * level only to decide whether or not to zap an entry. In this case, it's * not required to hold mmu_lock (though it's highly likely the caller will * want to hold mmu_lock anyways, e.g. to modify SPTEs). * * Note! The lookup can still race with modifications to host page tables, but * the above "rules" ensure KVM will not _consume_ the result of the walk if a * race with the primary MMU occurs. */ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, const struct kvm_memory_slot *slot) { int level = PG_LEVEL_4K; unsigned long hva; unsigned long flags; pgd_t pgd; p4d_t p4d; pud_t pud; pmd_t pmd; /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() * is not solely for performance, it's also necessary to avoid the * "writable" check in __gfn_to_hva_many(), which will always fail on * read-only memslots due to gfn_to_hva() assuming writes. Earlier * page fault steps have already verified the guest isn't writing a * read-only memslot. */ hva = __gfn_to_hva_memslot(slot, gfn); /* * Disable IRQs to prevent concurrent tear down of host page tables, * e.g. if the primary MMU promotes a P*D to a huge page and then frees * the original page table. */ local_irq_save(flags); /* * Read each entry once. As above, a non-leaf entry can be promoted to * a huge page _during_ this walk. Re-reading the entry could send the * walk into the weeks, e.g. p*d_leaf() returns false (sees the old * value) and then p*d_offset() walks into the target huge page instead * of the old page table (sees the new value). */ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); if (pgd_none(pgd)) goto out; p4d = READ_ONCE(*p4d_offset(&pgd, hva)); if (p4d_none(p4d) || !p4d_present(p4d)) goto out; pud = READ_ONCE(*pud_offset(&p4d, hva)); if (pud_none(pud) || !pud_present(pud)) goto out; if (pud_leaf(pud)) { level = PG_LEVEL_1G; goto out; } pmd = READ_ONCE(*pmd_offset(&pud, hva)); if (pmd_none(pmd) || !pmd_present(pmd)) goto out; if (pmd_leaf(pmd)) level = PG_LEVEL_2M; out: local_irq_restore(flags); return level; } static int __kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, int max_level, bool is_private) { struct kvm_lpage_info *linfo; int host_level; max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) break; } if (is_private) return max_level; if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, int max_level) { bool is_private = kvm_slot_can_be_private(slot) && kvm_mem_is_private(kvm, gfn); return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private); } void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; kvm_pfn_t mask; fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled; if (unlikely(fault->max_level == PG_LEVEL_4K)) return; if (is_error_noslot_pfn(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) return; /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot, fault->gfn, fault->max_level, fault->is_private); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; /* * mmu_invalidate_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ fault->goal_level = fault->req_level; mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1; VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask)); fault->pfn &= ~mask; } void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level) { if (cur_level > PG_LEVEL_4K && cur_level == fault->goal_level && is_shadow_present_pte(spte) && !is_large_pte(spte) && spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* * A small SPTE exists for this pfn, but FNAME(fetch), * direct_map(), or kvm_tdp_mmu_map() would like to create a * large PTE instead: just force them to go down another level, * patching back for them into pfn the next 9 bits of the * address. */ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - KVM_PAGES_PER_HPAGE(cur_level - 1); fault->pfn |= fault->gfn & page_mask; fault->goal_level--; } } static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; int ret; gfn_t base_gfn = fault->gfn; kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); for_each_shadow_entry(vcpu, fault->addr, it) { /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); if (sp == ERR_PTR(-EEXIST)) continue; link_shadow_page(vcpu, it.sptep, sp); if (fault->huge_page_disallowed) account_nx_huge_page(vcpu->kvm, sp, fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) return -EFAULT; ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; direct_pte_prefetch(vcpu, it.sptep); return ret; } static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long hva = gfn_to_hva_memslot(slot, gfn); send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current); } static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { if (is_sigpending_pfn(fault->pfn)) { kvm_handle_signal_exit(vcpu); return -EINTR; } /* * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. */ if (fault->pfn == KVM_PFN_ERR_RO_FAULT) return RET_PF_EMULATE; if (fault->pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(fault->slot, fault->gfn); return RET_PF_RETRY; } return -EFAULT; } static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { gva_t gva = fault->is_tdp ? 0 : fault->addr; if (fault->is_private) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; fault->hva = KVM_HVA_ERR_BAD; /* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an * MMIO SPTE will just be an expensive nop. */ if (unlikely(!enable_mmio_caching)) return RET_PF_EMULATE; /* * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR, * any guest that generates such gfns is running nested and is being * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and * only if L1's MAXPHYADDR is inaccurate with respect to the * hardware's). */ if (unlikely(fault->gfn > kvm_mmu_max_gfn())) return RET_PF_EMULATE; return RET_PF_CONTINUE; } static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault) { /* * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only * reach the common page fault handler if the SPTE has an invalid MMIO * generation number. Refreshing the MMIO generation needs to go down * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag! */ if (fault->rsvd) return false; /* * For hardware-protected VMs, certain conditions like attempting to * perform a write to a page which is not in the state that the guest * expects it to be in can result in a nested/extended #PF. In this * case, the below code might misconstrue this situation as being the * result of a write-protected access, and treat it as a spurious case * rather than taking any action to satisfy the real source of the #PF * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the * guest spinning on a #PF indefinitely, so don't attempt the fast path * in this case. * * Note that the kvm_mem_is_private() check might race with an * attribute update, but this will either result in the guest spinning * on RET_PF_SPURIOUS until the update completes, or an actual spurious * case might go down the slow path. Either case will resolve itself. */ if (kvm->arch.has_private_mem && fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) return false; /* * #PF can be fast if: * * 1. The shadow page table entry is not present and A/D bits are * disabled _by KVM_, which could mean that the fault is potentially * caused by access tracking (if enabled). If A/D bits are enabled * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D * bits for L2 and employ access tracking, but the fast page fault * mechanism only supports direct MMUs. * 2. The shadow page table entry is present, the access is a write, * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e. * the fault was caused by a write-protection violation. If the * SPTE is MMU-writable (determined later), the fault can be fixed * by setting the Writable bit, which can be done out of mmu_lock. */ if (!fault->present) return !kvm_ad_enabled(); /* * Note, instruction fetches and writes are mutually exclusive, ignore * the "exec" flag. */ return fault->write; } /* * Returns true if the SPTE was fixed successfully. Otherwise, * someone else modified the SPTE from its original value. */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, u64 new_spte) { /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in * set_spte. But fast_page_fault is very unlikely to happen with PML * enabled, so we do not do this. This might result in the same GPA * to be logged in PML buffer again when the write really happens, and * eventually to be called by mark_page_dirty twice. But it's also no * harm. This also avoids the TLB flush needed after setting dirty bit * so non-PML cases won't be impacted. * * Compare with set_spte where instead shadow_dirty_mask is set. */ if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn); return true; } static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) { if (fault->exec) return is_executable_pte(spte); if (fault->write) return is_writable_pte(spte); /* Fault was on Read access */ return spte & PT_PRESENT_MASK; } /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no * walk could be performed, returns NULL and *spte does not contain valid data. * * Contract: * - Must be called between walk_shadow_page_lockless_{begin,end}. * - The returned sptep must not be used after walk_shadow_page_lockless_end. */ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) { struct kvm_shadow_walk_iterator iterator; u64 old_spte; u64 *sptep = NULL; for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { sptep = iterator.sptep; *spte = old_spte; } return sptep; } /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte; u64 *sptep; uint retry_count = 0; if (!page_fault_can_be_fast(vcpu->kvm, fault)) return ret; walk_shadow_page_lockless_begin(vcpu); do { u64 new_spte; if (tdp_mmu_enabled) sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); /* * It's entirely possible for the mapping to have been zapped * by a different task, but the root page should always be * available as the vCPU holds a reference to its root(s). */ if (WARN_ON_ONCE(!sptep)) spte = FROZEN_SPTE; if (!is_shadow_present_pte(spte)) break; sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; /* * Check whether the memory access that caused the fault would * still cause it if it were to be performed right now. If not, * then this is a spurious fault caused by TLB lazily flushed, * or some other CPU has already fixed the PTE after the * current CPU took the fault. * * Need not check the access of upper level table entries since * they are always ACC_ALL. */ if (is_access_allowed(fault, spte)) { ret = RET_PF_SPURIOUS; break; } new_spte = spte; /* * KVM only supports fixing page faults outside of MMU lock for * direct MMUs, nested MMUs are always indirect, and KVM always * uses A/D bits for non-nested MMUs. Thus, if A/D bits are * enabled, the SPTE can't be an access-tracked SPTE. */ if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte)) new_spte = restore_acc_track_spte(new_spte); /* * To keep things simple, only SPTEs that are MMU-writable can * be made fully writable outside of mmu_lock, e.g. only SPTEs * that were write-protected for dirty-logging or access * tracking are handled here. Don't bother checking if the * SPTE is writable to prioritize running with A/D bits enabled. * The is_access_allowed() check above handles the common case * of the fault being spurious, and the SPTE is known to be * shadow-present, i.e. except for access tracking restoration * making the new SPTE writable, the check is wasteful. */ if (fault->write && is_mmu_writable_spte(spte)) { new_spte |= PT_WRITABLE_MASK; /* * Do not fix write-permission on the large spte when * dirty logging is enabled. Since we only dirty the * first page into the dirty-bitmap in * fast_pf_fix_direct_spte(), other pages are missed * if its slot has dirty logging enabled. * * Instead, we let the slow page fault path create a * normal spte to fix the access. */ if (sp->role.level > PG_LEVEL_4K && kvm_slot_dirty_track_enabled(fault->slot)) break; } /* Verify that the fault can be handled in the fast path */ if (new_spte == spte || !is_access_allowed(fault, new_spte)) break; /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } if (++retry_count > 4) { pr_warn_once("Fast #PF retrying more than 4 times.\n"); break; } } while (true); trace_fast_page_fault(vcpu, fault, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); if (ret != RET_PF_INVALID) vcpu->stat.pf_fast++; return ret; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, struct list_head *invalid_list) { struct kvm_mmu_page *sp; if (!VALID_PAGE(*root_hpa)) return; sp = root_to_sp(*root_hpa); if (WARN_ON_ONCE(!sp)) return; if (is_tdp_mmu_page(sp)) { lockdep_assert_held_read(&kvm->mmu_lock); kvm_tdp_mmu_put_root(kvm, sp); } else { lockdep_assert_held_write(&kvm->mmu_lock); if (!--sp->root_count && sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); } *root_hpa = INVALID_PAGE; } /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free) { bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct; int i; LIST_HEAD(invalid_list); bool free_active_root; WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL); BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); /* Before acquiring the MMU lock, see if we need to do any real work. */ free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT) && VALID_PAGE(mmu->root.hpa); if (!free_active_root) { for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && VALID_PAGE(mmu->prev_roots[i].hpa)) break; if (i == KVM_MMU_NUM_PREV_ROOTS) return; } if (is_tdp_mmu) read_lock(&kvm->mmu_lock); else write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, &invalid_list); if (free_active_root) { if (kvm_mmu_is_dummy_root(mmu->root.hpa)) { /* Nothing to cleanup for dummy roots. */ } else if (root_to_sp(mmu->root.hpa)) { mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list); } else if (mmu->pae_root) { for (i = 0; i < 4; ++i) { if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) continue; mmu_free_root_page(kvm, &mmu->pae_root[i], &invalid_list); mmu->pae_root[i] = INVALID_PAE_ROOT; } } mmu->root.hpa = INVALID_PAGE; mmu->root.pgd = 0; } if (is_tdp_mmu) { read_unlock(&kvm->mmu_lock); WARN_ON_ONCE(!list_empty(&invalid_list)); } else { kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); } } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; struct kvm_mmu_page *sp; hpa_t root_hpa; int i; /* * This should not be called while L2 is active, L2 can't invalidate * _only_ its own roots, e.g. INVVPID unconditionally exits. */ WARN_ON_ONCE(mmu->root_role.guest_mode); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { root_hpa = mmu->prev_roots[i].hpa; if (!VALID_PAGE(root_hpa)) continue; sp = root_to_sp(root_hpa); if (!sp || sp->role.guest_mode) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } kvm_mmu_free_roots(kvm, mmu, roots_to_free); } EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, u8 level) { union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; struct kvm_mmu_page *sp; role.level = level; role.quadrant = quadrant; WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); sp = kvm_mmu_get_shadow_page(vcpu, gfn, role); ++sp->root_count; return __pa(sp->spt); } static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u8 shadow_root_level = mmu->root_role.level; hpa_t root; unsigned i; int r; if (tdp_mmu_enabled) return kvm_tdp_mmu_alloc_root(vcpu); write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) goto out_unlock; if (shadow_root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { if (WARN_ON_ONCE(!mmu->pae_root)) { r = -EIO; goto out_unlock; } for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; } mmu->root.hpa = __pa(mmu->pae_root); } else { WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); r = -EIO; goto out_unlock; } /* root.pgd is ignored for direct MMUs. */ mmu->root.pgd = 0; out_unlock: write_unlock(&vcpu->kvm->mmu_lock); return r; } static int mmu_first_shadow_root_alloc(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; int r = 0, i, bkt; /* * Check if this is the first shadow root being allocated before * taking the lock. */ if (kvm_shadow_root_allocated(kvm)) return 0; mutex_lock(&kvm->slots_arch_lock); /* Recheck, under the lock, whether this is the first shadow root. */ if (kvm_shadow_root_allocated(kvm)) goto out_unlock; /* * Check if anything actually needs to be allocated, e.g. all metadata * will be allocated upfront if TDP is disabled. */ if (kvm_memslots_have_rmaps(kvm) && kvm_page_track_write_tracking_enabled(kvm)) goto out_success; for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, bkt, slots) { /* * Both of these functions are no-ops if the target is * already allocated, so unconditionally calling both * is safe. Intentionally do NOT free allocations on * failure to avoid having to track which allocations * were made now versus when the memslot was created. * The metadata is guaranteed to be freed when the slot * is freed, and will be kept/used if userspace retries * KVM_RUN instead of killing the VM. */ r = memslot_rmap_alloc(slot, slot->npages); if (r) goto out_unlock; r = kvm_page_track_write_tracking_alloc(slot); if (r) goto out_unlock; } } /* * Ensure that shadow_root_allocated becomes true strictly after * all the related pointers are set. */ out_success: smp_store_release(&kvm->arch.shadow_root_allocated, true); out_unlock: mutex_unlock(&kvm->slots_arch_lock); return r; } static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; int quadrant, i, r; hpa_t root; root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT; if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { mmu->root.hpa = kvm_mmu_get_dummy_root(); return 0; } /* * On SVM, reading PDPTRs might access guest memory, which might fault * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. */ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { pdptrs[i] = mmu->get_pdptr(vcpu, i); if (!(pdptrs[i] & PT_PRESENT_MASK)) continue; if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT)) pdptrs[i] = 0; } } r = mmu_first_shadow_root_alloc(vcpu->kvm); if (r) return r; write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) goto out_unlock; /* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, mmu->root_role.level); mmu->root.hpa = root; goto set_root_pgd; } if (WARN_ON_ONCE(!mmu->pae_root)) { r = -EIO; goto out_unlock; } /* * We shadow a 32 bit page table. This may be a legacy 2-level * or a PAE 3-level page table. In either case we need to be aware that * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK | shadow_me_value; if (mmu->root_role.level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; if (WARN_ON_ONCE(!mmu->pml4_root)) { r = -EIO; goto out_unlock; } mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; if (mmu->root_role.level == PT64_ROOT_5LEVEL) { if (WARN_ON_ONCE(!mmu->pml5_root)) { r = -EIO; goto out_unlock; } mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; } } for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { if (!(pdptrs[i] & PT_PRESENT_MASK)) { mmu->pae_root[i] = INVALID_PAE_ROOT; continue; } root_gfn = pdptrs[i] >> PAGE_SHIFT; } /* * If shadowing 32-bit non-PAE page tables, each PAE page * directory maps one quarter of the guest's non-PAE page * directory. Othwerise each PAE page direct shadows one guest * PAE page directory so that quadrant should be 0. */ quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } if (mmu->root_role.level == PT64_ROOT_5LEVEL) mmu->root.hpa = __pa(mmu->pml5_root); else if (mmu->root_role.level == PT64_ROOT_4LEVEL) mmu->root.hpa = __pa(mmu->pml4_root); else mmu->root.hpa = __pa(mmu->pae_root); set_root_pgd: mmu->root.pgd = root_pgd; out_unlock: write_unlock(&vcpu->kvm->mmu_lock); return r; } static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL; u64 *pml5_root = NULL; u64 *pml4_root = NULL; u64 *pae_root; /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP * tables are allocated and initialized at root creation as there is no * equivalent level in the guest's NPT to shadow. Allocate the tables * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. */ if (mmu->root_role.direct || mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL || mmu->root_role.level < PT64_ROOT_4LEVEL) return 0; /* * NPT, the only paging mode that uses this horror, uses a fixed number * of levels for the shadow page tables, e.g. all MMUs are 4-level or * all MMus are 5-level. Thus, this can safely require that pml5_root * is allocated if the other roots are valid and pml5 is needed, as any * prior MMU would also have required pml5. */ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) return 0; /* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid. */ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || (need_pml5 && mmu->pml5_root))) return -EIO; /* * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and * doesn't need to be decrypted. */ pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pae_root) return -ENOMEM; #ifdef CONFIG_X86_64 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml4_root) goto err_pml4; if (need_pml5) { pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml5_root) goto err_pml5; } #endif mmu->pae_root = pae_root; mmu->pml4_root = pml4_root; mmu->pml5_root = pml5_root; return 0; #ifdef CONFIG_X86_64 err_pml5: free_page((unsigned long)pml4_root); err_pml4: free_page((unsigned long)pae_root); return -ENOMEM; #endif } static bool is_unsync_root(hpa_t root) { struct kvm_mmu_page *sp; if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root)) return false; /* * The read barrier orders the CPU's read of SPTE.W during the page table * walk before the reads of sp->unsync/sp->unsync_children here. * * Even if another CPU was marking the SP as unsync-ed simultaneously, * any guest page table changes are not guaranteed to be visible anyway * until this VCPU issues a TLB flush strictly after those changes are * made. We only need to ensure that the other CPU sets these flags * before any actual changes to the page tables are made. The comments * in mmu_try_to_unsync_pages() describe what could go wrong if this * requirement isn't satisfied. */ smp_rmb(); sp = root_to_sp(root); /* * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the * PDPTEs for a given PAE root need to be synchronized individually. */ if (WARN_ON_ONCE(!sp)) return false; if (sp->unsync || sp->unsync_children) return true; return false; } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; if (vcpu->arch.mmu->root_role.direct) return; if (!VALID_PAGE(vcpu->arch.mmu->root.hpa)) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root.hpa; if (!is_unsync_root(root)) return; sp = root_to_sp(root); write_lock(&vcpu->kvm->mmu_lock); mmu_sync_children(vcpu, sp, true); write_unlock(&vcpu->kvm->mmu_lock); return; } write_lock(&vcpu->kvm->mmu_lock); for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { sp = spte_to_child_sp(root); mmu_sync_children(vcpu, sp, true); } } write_unlock(&vcpu->kvm->mmu_lock); } void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) { unsigned long roots_to_free = 0; int i; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa)) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); /* sync prev_roots by simply freeing them */ kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free); } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t vaddr, u64 access, struct x86_exception *exception) { if (exception) exception->error_code = 0; return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { /* * A nested guest cannot use the MMIO cache if it is using nested * page tables, because cr2 is a nGPA while the cache stores GPAs. */ if (mmu_is_nested(vcpu)) return false; if (direct) return vcpu_match_mmio_gpa(vcpu, addr); return vcpu_match_mmio_gva(vcpu, addr); } /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. * * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { struct kvm_shadow_walk_iterator iterator; int leaf = -1; u64 spte; for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf] = spte; } return leaf; } static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { int leaf; walk_shadow_page_lockless_begin(vcpu); if (is_tdp_mmu_active(vcpu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level); else leaf = get_walk(vcpu, addr, sptes, root_level); walk_shadow_page_lockless_end(vcpu); return leaf; } /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; struct rsvd_bits_validate *rsvd_check; int root, leaf, level; bool reserved = false; leaf = get_sptes_lockless(vcpu, addr, sptes, &root); if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; } *sptep = sptes[leaf]; /* * Skip reserved bits checks on the terminal leaf if it's not a valid * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by * design, always have reserved bits set. The purpose of the checks is * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. */ if (!is_shadow_present_pte(sptes[leaf])) leaf++; rsvd_check = &vcpu->arch.mmu->shadow_zero_check; for (level = root; level >= leaf; level--) reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); if (reserved) { pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", __func__, addr); for (level = root; level >= leaf; level--) pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", sptes[level], level, get_rsvd_bits(rsvd_check, sptes[level], level)); } return reserved; } static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; if (mmio_info_in_cache(vcpu, addr, direct)) return RET_PF_EMULATE; reserved = get_mmio_spte(vcpu, addr, &spte); if (WARN_ON_ONCE(reserved)) return -EINVAL; if (is_mmio_spte(vcpu->kvm, spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned int access = get_mmio_spte_access(spte); if (!check_mmio_spte(vcpu, spte)) return RET_PF_INVALID; if (direct) addr = 0; trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); return RET_PF_EMULATE; } /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ return RET_PF_RETRY; } static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { if (unlikely(fault->rsvd)) return false; if (!fault->present || !fault->write) return false; /* * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn)) return true; return false; } static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) { struct kvm_shadow_walk_iterator iterator; u64 spte; walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) clear_sp_write_flooding_count(iterator.sptep); walk_shadow_page_lockless_end(vcpu); } static u32 alloc_apf_token(struct kvm_vcpu *vcpu) { /* make sure the token value is not 0 */ u32 id = vcpu->arch.apf.id; if (id << 12 == 0) vcpu->arch.apf.id = 1; return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; } static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_arch_async_pf arch; arch.token = alloc_apf_token(vcpu); arch.gfn = fault->gfn; arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); return kvm_setup_async_pf(vcpu, fault->addr, kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) return; if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || work->wakeup_all) return; r = kvm_mmu_reload(vcpu); if (unlikely(r)) return; if (!vcpu->arch.mmu->root_role.direct && work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL, NULL); /* * Account fixed page faults, otherwise they'll never be counted, but * ignore stats for all other return times. Page-ready "faults" aren't * truly spurious and never trigger emulation */ if (r == RET_PF_FIXED) vcpu->stat.pf_fixed++; } static inline u8 kvm_max_level_for_order(int order) { BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) return PG_LEVEL_1G; if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) return PG_LEVEL_2M; return PG_LEVEL_4K; } static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, u8 max_level, int gmem_order) { u8 req_max_level; if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; max_level = min(kvm_max_level_for_order(gmem_order), max_level); if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn); if (req_max_level) max_level = min(max_level, req_max_level); return req_max_level; } static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { int max_order, r; if (!kvm_slot_can_be_private(fault->slot)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, &max_order); if (r) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return r; } fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, fault->max_level, max_order); return RET_PF_CONTINUE; } static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { bool async; if (fault->is_private) return kvm_faultin_pfn_private(vcpu, fault); async = false; fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, &fault->hva); if (!async) return RET_PF_CONTINUE; /* *pfn has correct page already */ if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(fault->addr, fault->gfn); if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return RET_PF_RETRY; } else if (kvm_arch_setup_async_pf(vcpu, fault)) { return RET_PF_RETRY; } } /* * Allow gup to bail on pending non-fatal signals when it's also allowed * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, NULL, fault->write, &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; } static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { struct kvm_memory_slot *slot = fault->slot; int ret; /* * Note that the mmu_invalidate_seq also serves to detect a concurrent * change in attributes. is_page_fault_stale() will detect an * invalidation relate to fault->fn and resume the guest without * installing a mapping in the page tables. */ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); /* * Now that we have a snapshot of mmu_invalidate_seq we can check for a * private vs. shared mismatch. */ if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } if (unlikely(!slot)) return kvm_handle_noslot_fault(vcpu, fault, access); /* * Retry the page fault if the gfn hit a memslot that is being deleted * or moved. This ensures any existing SPTEs for the old memslot will * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot->flags & KVM_MEMSLOT_INVALID) return RET_PF_RETRY; if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { /* * Don't map L1's APIC access page into L2, KVM doesn't support * using APICv/AVIC to accelerate L2 accesses to L1's APIC, * i.e. the access needs to be emulated. Emulating access to * L1's APIC is also correct if L1 is accelerating L2's own * virtual APIC, but for some reason L1 also maps _L1's_ APIC * into L2. Note, vcpu_is_mmio_gpa() always treats access to * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM * uses different roots for L1 vs. L2, i.e. there is no danger * of breaking APICv/AVIC for L1. */ if (is_guest_mode(vcpu)) return kvm_handle_noslot_fault(vcpu, fault, access); /* * If the APIC access page exists but is disabled, go directly * to emulation without caching the MMIO access or creating a * MMIO SPTE. That way the cache doesn't need to be purged * when the AVIC is re-enabled. */ if (!kvm_apicv_activated(vcpu->kvm)) return RET_PF_EMULATE; } /* * Check for a relevant mmu_notifier invalidation event before getting * the pfn from the primary MMU, and before acquiring mmu_lock. * * For mmu_lock, if there is an in-progress invalidation and the kernel * allows preemption, the invalidation task may drop mmu_lock and yield * in response to mmu_lock being contended, which is *very* counter- * productive as this vCPU can't actually make forward progress until * the invalidation completes. * * Retrying now can also avoid unnessary lock contention in the primary * MMU, as the primary MMU doesn't necessarily hold a single lock for * the duration of the invalidation, i.e. faulting in a conflicting pfn * can cause the invalidation to take longer by holding locks that are * needed to complete the invalidation. * * Do the pre-check even for non-preemtible kernels, i.e. even if KVM * will never yield mmu_lock in response to contention, as this vCPU is * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held * to detect retry guarantees the worst case latency for the vCPU. */ if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; ret = __kvm_faultin_pfn(vcpu, fault); if (ret != RET_PF_CONTINUE) return ret; if (unlikely(is_error_pfn(fault->pfn))) return kvm_handle_error_pfn(vcpu, fault); if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn))) return kvm_handle_noslot_fault(vcpu, fault, access); /* * Check again for a relevant mmu_notifier invalidation event purely to * avoid contending mmu_lock. Most invalidations will be detected by * the previous check, but checking is extremely cheap relative to the * overall cost of failing to detect the invalidation until after * mmu_lock is acquired. */ if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) { kvm_release_pfn_clean(fault->pfn); return RET_PF_RETRY; } return RET_PF_CONTINUE; } /* * Returns true if the page fault is stale and needs to be retried, i.e. if the * root was invalidated by a memslot update or a relevant mmu_notifier fired. */ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); /* Special roots, e.g. pae_root, are not backed by shadow pages. */ if (sp && is_obsolete_sp(vcpu->kvm, sp)) return true; /* * Roots without an associated shadow page are considered invalid if * there is a pending request to free obsolete roots. The request is * only a hint that the current root _may_ be obsolete and needs to be * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs * to reload even if no vCPU is actively using the root. */ if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) return true; /* * Check for a relevant mmu_notifier invalidation event one last time * now that mmu_lock is held, as the "unsafe" checks performed without * holding mmu_lock can get false negatives. */ return fault->slot && mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { int r; /* Dummy roots are used only for shadowing bad guest roots. */ if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r; r = mmu_topup_memory_caches(vcpu, false); if (r) return r; r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; r = direct_map(vcpu, fault); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(fault->pfn); return r; } static int nonpaging_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ fault->max_level = PG_LEVEL_2M; return direct_page_fault(vcpu, fault); } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { int r = 1; u32 flags = vcpu->arch.apf.host_apf_flags; #ifndef CONFIG_X86_64 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ if (WARN_ON_ONCE(fault_address >> 32)) return -EFAULT; #endif /* * Legacy #PF exception only have a 32-bit error code. Simply drop the * upper bits as KVM doesn't use them for #PF (because they are never * set), and to ensure there are no collisions with KVM-defined bits. */ if (WARN_ON_ONCE(error_code >> 32)) error_code = lower_32_bits(error_code); /* * Restrict KVM-defined flags to bits 63:32 so that it's impossible for * them to conflict with #PF error codes, which are limited to 32 bits. */ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; if (!flags) { trace_kvm_page_fault(vcpu, fault_address, error_code); if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); } else { WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); } return r; } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); #ifdef CONFIG_X86_64 static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { int r; if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r; r = mmu_topup_memory_caches(vcpu, false); if (r) return r; r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; r = RET_PF_RETRY; read_lock(&vcpu->kvm->mmu_lock); if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = kvm_tdp_mmu_map(vcpu, fault); out_unlock: read_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(fault->pfn); return r; } #endif bool kvm_mmu_may_ignore_guest_pat(void) { /* * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does * not support self-snoop (or is affected by an erratum), and the VM * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to * honor the memtype from the guest's PAT so that guest accesses to * memory that is DMA'd aren't cached against the guest's wishes. As a * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE * bits in response to non-coherent device (un)registration. */ return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask; } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { #ifdef CONFIG_X86_64 if (tdp_mmu_enabled) return kvm_tdp_mmu_page_fault(vcpu, fault); #endif return direct_page_fault(vcpu, fault); } static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) { int r; /* * Restrict to TDP page fault, since that's the only case where the MMU * is indexed by GPA. */ if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault) return -EOPNOTSUPP; do { if (signal_pending(current)) return -EINTR; cond_resched(); r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); } while (r == RET_PF_RETRY); if (r < 0) return r; switch (r) { case RET_PF_FIXED: case RET_PF_SPURIOUS: return 0; case RET_PF_EMULATE: return -ENOENT; case RET_PF_RETRY: case RET_PF_CONTINUE: case RET_PF_INVALID: default: WARN_ONCE(1, "could not fix page fault during prefault"); return -EIO; } } long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) { u64 error_code = PFERR_GUEST_FINAL_MASK; u8 level = PG_LEVEL_4K; u64 end; int r; /* * reload is efficient when called repeatedly, so we can do it on * every iteration. */ kvm_mmu_reload(vcpu); if (kvm_arch_has_private_mem(vcpu->kvm) && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) error_code |= PFERR_PRIVATE_ACCESS; /* * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging. */ r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); if (r < 0) return r; /* * If the mapping that covers range->gpa can use a huge page, it * may start below it or end after range->gpa + range->size. */ end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); return min(range->size, end - range->gpa); } static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_spte = NULL; } static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { struct kvm_mmu_page *sp; if (!VALID_PAGE(root->hpa)) return false; if (!role.direct && pgd != root->pgd) return false; sp = root_to_sp(root->hpa); if (WARN_ON_ONCE(!sp)) return false; return role.word == sp->role.word; } /* * Find out if a previously cached root matching the new pgd/role is available, * and insert the current root as the MRU in the cache. * If a matching root is found, it is assigned to kvm_mmu->root and * true is returned. * If no match is found, kvm_mmu->root is left invalid, the LRU root is * evicted to make room for the current root, and false is returned. */ static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { uint i; if (is_root_usable(&mmu->root, new_pgd, new_role)) return true; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { /* * The swaps end up rotating the cache like this: * C 0 1 2 3 (on entry to the function) * 0 C 1 2 3 * 1 C 0 2 3 * 2 C 0 1 3 * 3 C 0 1 2 (on exit from the loop) */ swap(mmu->root, mmu->prev_roots[i]); if (is_root_usable(&mmu->root, new_pgd, new_role)) return true; } kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); return false; } /* * Find out if a previously cached root matching the new pgd/role is available. * On entry, mmu->root is invalid. * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry * of the cache becomes invalid, and true is returned. * If no match is found, kvm_mmu->root is left invalid and false is returned. */ static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { uint i; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role)) goto hit; return false; hit: swap(mmu->root, mmu->prev_roots[i]); /* Bubble up the remaining roots. */ for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++) mmu->prev_roots[i] = mmu->prev_roots[i + 1]; mmu->prev_roots[i].hpa = INVALID_PAGE; return true; } static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { /* * Limit reuse to 64-bit hosts+VMs without "special" roots in order to * avoid having to deal with PDPTEs and other complexities. */ if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa)) kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); if (VALID_PAGE(mmu->root.hpa)) return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role); else return cached_root_find_without_current(kvm, mmu, new_pgd, new_role); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) { struct kvm_mmu *mmu = vcpu->arch.mmu; union kvm_mmu_page_role new_role = mmu->root_role; /* * Return immediately if no usable root was found, kvm_mmu_reload() * will establish a valid root prior to the next VM-Enter. */ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) return; /* * It's possible that the cached previous root page is obsolete because * of a change in the MMU generation number. However, changing the * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, * which will free the root set here and allocate a new one. */ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); if (force_flush_and_sync_on_reuse) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } /* * The last MMIO access's GVA and GPA are cached in the VCPU. When * switching to a new CR3, that GVA->GPA mapping may no longer be * valid. So clear any cached MMIO info even when we don't need to sync * the shadow page tables. */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); /* * If this is a direct root page, it doesn't have a write flooding * count. Otherwise, clear the write flooding count. */ if (!new_role.direct) { struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); if (!WARN_ON_ONCE(!sp)) __clear_sp_write_flooding_count(sp); } } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) { if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { mmu_spte_clear_no_track(sptep); return true; } mark_mmio_spte(vcpu, sptep, gfn, access); return true; } return false; } #define PTTYPE_EPT 18 /* arbitrary */ #define PTTYPE PTTYPE_EPT #include "paging_tmpl.h" #undef PTTYPE #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE #define PTTYPE 32 #include "paging_tmpl.h" #undef PTTYPE static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; u64 high_bits_rsvd; rsvd_check->bad_mt_xwr = 0; if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); if (level == PT32E_ROOT_LEVEL) high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); else high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); /* Note, NX doesn't exist in PDPTEs, this is handled below. */ if (!nx) high_bits_rsvd |= rsvd_bits(63, 63); /* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only. */ if (amd) nonleaf_bit8_rsvd = rsvd_bits(8, 8); switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ rsvd_check->rsvd_bits_mask[0][1] = 0; rsvd_check->rsvd_bits_mask[0][0] = 0; rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; if (!pse) { rsvd_check->rsvd_bits_mask[1][1] = 0; break; } if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | high_bits_rsvd | rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_5LEVEL: rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; fallthrough; case PT64_ROOT_4LEVEL: rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | gbpages_bit_rsvd; rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | gbpages_bit_rsvd | rsvd_bits(13, 29); rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; } } static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->cpu_role.base.level, is_efer_nx(context), guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), guest_cpuid_is_amd_compatible(vcpu)); } static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, u64 pa_bits_rsvd, bool execonly, int huge_page_level) { u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); u64 large_1g_rsvd = 0, large_2m_rsvd = 0; u64 bad_mt_xwr; if (huge_page_level < PG_LEVEL_1G) large_1g_rsvd = rsvd_bits(7, 7); if (huge_page_level < PG_LEVEL_2M) large_2m_rsvd = rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ if (!execonly) { /* bits 0..2 must not be 100 unless VMX capabilities allow it */ bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } rsvd_check->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly, int huge_page_level) { __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, execonly, huge_page_level); } static inline u64 reserved_hpa_bits(void) { return rsvd_bits(kvm_host.maxphyaddr, 63); } /* * the page table on host is the shadow page table for the page * table in guest or amd nested guest, its mmu features completely * follow the features in guest. */ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ bool is_amd = true; /* KVM doesn't use 2-level page tables for the shadow MMU. */ bool is_pse = false; struct rsvd_bits_validate *shadow_zero_check; int i; WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL); shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_pse, is_amd); if (!shadow_me_mask) return; for (i = context->root_role.level; --i >= 0;) { /* * So far shadow_me_value is a constant during KVM's life * time. Bits in shadow_me_value are allowed to be set. * Bits in shadow_me_mask but not in shadow_me_value are * not allowed to be set. */ shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask; shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask; shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value; shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value; } } static inline bool boot_cpu_is_amd(void) { WARN_ON_ONCE(!tdp_enabled); return shadow_x_mask == 0; } /* * the direct page table on host, use as much mmu features as * possible, however, kvm currently does not do execution-protection. */ static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) { struct rsvd_bits_validate *shadow_zero_check; int i; shadow_zero_check = &context->shadow_zero_check; if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, true, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, reserved_hpa_bits(), false, max_huge_page_level); if (!shadow_me_mask) return; for (i = context->root_role.level; --i >= 0;) { shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; } } /* * as the comments in reset_shadow_zero_bits_mask() except it * is the shadow page table for intel nested guest. */ static void reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, reserved_hpa_bits(), execonly, max_huge_page_level); } #define BYTE_MASK(access) \ ((1 & (access) ? 2 : 0) | \ (2 & (access) ? 4 : 0) | \ (3 & (access) ? 8 : 0) | \ (4 & (access) ? 16 : 0) | \ (5 & (access) ? 32 : 0) | \ (6 & (access) ? 64 : 0) | \ (7 & (access) ? 128 : 0)) static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) { unsigned byte; const u8 x = BYTE_MASK(ACC_EXEC_MASK); const u8 w = BYTE_MASK(ACC_WRITE_MASK); const u8 u = BYTE_MASK(ACC_USER_MASK); bool cr4_smep = is_cr4_smep(mmu); bool cr4_smap = is_cr4_smap(mmu); bool cr0_wp = is_cr0_wp(mmu); bool efer_nx = is_efer_nx(mmu); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { unsigned pfec = byte << 1; /* * Each "*f" variable has a 1 bit for each UWX value * that causes a fault with the given PFEC. */ /* Faults from writes to non-writable pages */ u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; /* Faults from user mode accesses to supervisor pages */ u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; /* Faults from fetches of non-executable pages*/ u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0; /* Faults from kernel mode fetches of user pages */ u8 smepf = 0; /* Faults from kernel mode accesses of user pages */ u8 smapf = 0; if (!ept) { /* Faults from kernel mode accesses to user pages */ u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; /* Not really needed: !nx will cause pte.nx to fault */ if (!efer_nx) ff = 0; /* Allow supervisor writes if !cr0.wp */ if (!cr0_wp) wf = (pfec & PFERR_USER_MASK) ? wf : 0; /* Disallow supervisor fetches of user code if cr4.smep */ if (cr4_smep) smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0; /* * SMAP:kernel-mode data accesses from user-mode * mappings should fault. A fault is considered * as a SMAP violation if all of the following * conditions are true: * - X86_CR4_SMAP is set in CR4 * - A user page is accessed * - The access is not a fetch * - The access is supervisor mode * - If implicit supervisor access or X86_EFLAGS_AC is clear * * Here, we cover the first four conditions. * The fifth is computed dynamically in permission_fault(); * PFERR_RSVD_MASK bit will be set in PFEC if the access is * *not* subject to SMAP restrictions. */ if (cr4_smap) smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf; } mmu->permissions[byte] = ff | uf | wf | smepf | smapf; } } /* * PKU is an additional mechanism by which the paging controls access to * user-mode addresses based on the value in the PKRU register. Protection * key violations are reported through a bit in the page fault error code. * Unlike other bits of the error code, the PK bit is not known at the * call site of e.g. gva_to_gpa; it must be computed directly in * permission_fault based on two bits of PKRU, on some machine state (CR4, * CR0, EFER, CPL), and on other bits of the error code and the page tables. * * In particular the following conditions come from the error code, the * page tables and the machine state: * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) * - PK is always zero if U=0 in the page tables * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. * * The PKRU bitmask caches the result of these four conditions. The error * code (minus the P bit) and the page table's U bit form an index into the * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed * with the two bits of the PKRU register corresponding to the protection key. * For the first three conditions above the bits will be 00, thus masking * away both AD and WD. For all reads or if the last condition holds, WD * only will be masked away. */ static void update_pkru_bitmask(struct kvm_mmu *mmu) { unsigned bit; bool wp; mmu->pkru_mask = 0; if (!is_cr4_pke(mmu)) return; wp = is_cr0_wp(mmu); for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { unsigned pfec, pkey_bits; bool check_pkey, check_write, ff, uf, wf, pte_user; pfec = bit << 1; ff = pfec & PFERR_FETCH_MASK; uf = pfec & PFERR_USER_MASK; wf = pfec & PFERR_WRITE_MASK; /* PFEC.RSVD is replaced by ACC_USER_MASK. */ pte_user = pfec & PFERR_RSVD_MASK; /* * Only need to check the access which is not an * instruction fetch and is to a user page. */ check_pkey = (!ff && pte_user); /* * write access is controlled by PKRU if it is a * user access or CR0.WP = 1. */ check_write = check_pkey && wf && (uf || wp); /* PKRU.AD stops both read and write access. */ pkey_bits = !!check_pkey; /* PKRU.WD stops write access. */ pkey_bits |= (!!check_write) << 1; mmu->pkru_mask |= (pkey_bits & 3) << pfec; } } static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { if (!is_cr0_pg(mmu)) return; reset_guest_rsvds_bits_mask(vcpu, mmu); update_permission_bitmask(mmu, false); update_pkru_bitmask(mmu); } static void paging64_init_context(struct kvm_mmu *context) { context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_spte = paging64_sync_spte; } static void paging32_init_context(struct kvm_mmu *context) { context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; context->sync_spte = paging32_sync_spte; } static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs) { union kvm_cpu_role role = {0}; role.base.access = ACC_ALL; role.base.smm = is_smm(vcpu); role.base.guest_mode = is_guest_mode(vcpu); role.ext.valid = 1; if (!____is_cr0_pg(regs)) { role.base.direct = 1; return role; } role.base.efer_nx = ____is_efer_nx(regs); role.base.cr0_wp = ____is_cr0_wp(regs); role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs); role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs); role.base.has_4_byte_gpte = !____is_cr4_pae(regs); if (____is_efer_lma(regs)) role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; else if (____is_cr4_pae(regs)) role.base.level = PT32E_ROOT_LEVEL; else role.base.level = PT32_ROOT_LEVEL; role.ext.cr4_smep = ____is_cr4_smep(regs); role.ext.cr4_smap = ____is_cr4_smap(regs); role.ext.cr4_pse = ____is_cr4_pse(regs); /* PKEY and LA57 are active iff long mode is active. */ role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); role.ext.efer_lma = ____is_efer_lma(regs); return role; } void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP); BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP); BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS)); if (is_cr0_wp(mmu) == cr0_wp) return; mmu->cpu_role.base.cr0_wp = cr0_wp; reset_guest_paging_metadata(vcpu, mmu); } static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { /* tdp_root_level is architecture forced level, use it if nonzero */ if (tdp_root_level) return tdp_root_level; /* Use 5-level TDP if and only if it's useful/necessary. */ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; return max_tdp_level; } u8 kvm_mmu_get_max_tdp_level(void) { return tdp_root_level ? tdp_root_level : max_tdp_level; } static union kvm_mmu_page_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { union kvm_mmu_page_role role = {0}; role.access = ACC_ALL; role.cr0_wp = true; role.efer_nx = true; role.smm = cpu_role.base.smm; role.guest_mode = cpu_role.base.guest_mode; role.ad_disabled = !kvm_ad_enabled(); role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; return role; } static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role); if (cpu_role.as_u64 == context->cpu_role.as_u64 && root_role.word == context->root_role.word) return; context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; context->page_fault = kvm_tdp_page_fault; context->sync_spte = NULL; context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; if (!is_cr0_pg(context)) context->gva_to_gpa = nonpaging_gva_to_gpa; else if (is_cr4_pae(context)) context->gva_to_gpa = paging64_gva_to_gpa; else context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, context); reset_tdp_shadow_zero_bits_mask(context); } static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, union kvm_cpu_role cpu_role, union kvm_mmu_page_role root_role) { if (cpu_role.as_u64 == context->cpu_role.as_u64 && root_role.word == context->root_role.word) return; context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; if (!is_cr0_pg(context)) nonpaging_init_context(context); else if (is_cr4_pae(context)) paging64_init_context(context); else paging32_init_context(context); reset_guest_paging_metadata(vcpu, context); reset_shadow_zero_bits_mask(vcpu, context); } static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_page_role root_role; root_role = cpu_role.base; /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */ root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL); /* * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role. * KVM uses NX when TDP is disabled to handle a variety of scenarios, * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. * The iTLB multi-hit workaround can be toggled at any time, so assume * NX can be used by any non-nested shadow MMU to avoid having to reset * MMU contexts. */ root_role.efer_nx = true; shadow_mmu_init_context(vcpu, context, cpu_role, root_role); } void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, unsigned long cr4, u64 efer, gpa_t nested_cr3) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; struct kvm_mmu_role_regs regs = { .cr0 = cr0, .cr4 = cr4 & ~X86_CR4_PKE, .efer = efer, }; union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, ®s); union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. */ WARN_ON_ONCE(cpu_role.base.direct); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); if (root_role.level == PT64_ROOT_5LEVEL && cpu_role.base.level == PT64_ROOT_4LEVEL) root_role.passthrough = 1; shadow_mmu_init_context(vcpu, context, cpu_role, root_role); kvm_mmu_new_pgd(vcpu, nested_cr3); } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); static union kvm_cpu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, bool execonly, u8 level) { union kvm_cpu_role role = {0}; /* * KVM does not support SMM transfer monitors, and consequently does not * support the "entry to SMM" control either. role.base.smm is always 0. */ WARN_ON_ONCE(is_smm(vcpu)); role.base.level = level; role.base.has_4_byte_gpte = false; role.base.direct = false; role.base.ad_disabled = !accessed_dirty; role.base.guest_mode = true; role.base.access = ACC_ALL; role.ext.word = 0; role.ext.execonly = execonly; role.ext.valid = 1; return role; } void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, int huge_page_level, bool accessed_dirty, gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); union kvm_cpu_role new_mode = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); if (new_mode.as_u64 != context->cpu_role.as_u64) { /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ context->cpu_role.as_u64 = new_mode.as_u64; context->root_role.word = new_mode.base.word; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_spte = ept_sync_spte; update_permission_bitmask(context, true); context->pkru_mask = 0; reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); reset_ept_shadow_zero_bits_mask(context, execonly); } kvm_mmu_new_pgd(vcpu, new_eptp); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; kvm_init_shadow_mmu(vcpu, cpu_role); context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role new_mode) { struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; if (new_mode.as_u64 == g_context->cpu_role.as_u64) return; g_context->cpu_role.as_u64 = new_mode.as_u64; g_context->get_guest_pgd = get_guest_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; /* * L2 page tables are never shadowed, so there is no need to sync * SPTEs. */ g_context->sync_spte = NULL; /* * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using * L1's nested page tables (e.g. EPT12). The nested translation * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using * L2's page tables as the first level of translation and L1's * nested page tables as the second level of translation. Basically * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) g_context->gva_to_gpa = nonpaging_gva_to_gpa; else if (is_long_mode(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa; else if (is_pae(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa; else g_context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, g_context); } void kvm_init_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, ®s); if (mmu_is_nested(vcpu)) init_kvm_nested_mmu(vcpu, cpu_role); else if (tdp_enabled) init_kvm_tdp_mmu(vcpu, cpu_role); else init_kvm_softmmu(vcpu, cpu_role); } EXPORT_SYMBOL_GPL(kvm_init_mmu); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) { /* * Invalidate all MMU roles to force them to reinitialize as CPUID * information is factored into reserved bit calculations. * * Correctly handling multiple vCPU models with respect to paging and * physical address properties) in a single VM would require tracking * all relevant CPUID information in kvm_mmu_page_role. That is very * undesirable as it would increase the memory requirements for * gfn_write_track (see struct kvm_mmu_page_role comments). For now * that problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ vcpu->arch.root_mmu.root_role.invalid = 1; vcpu->arch.guest_mmu.root_role.invalid = 1; vcpu->arch.nested_mmu.root_role.invalid = 1; vcpu->arch.root_mmu.cpu_role.ext.valid = 0; vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; kvm_mmu_reset_context(vcpu); /* * Changing guest CPUID after KVM_RUN is forbidden, see the comment in * kvm_arch_vcpu_ioctl(). */ KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); kvm_init_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct); if (r) goto out; r = mmu_alloc_special_roots(vcpu); if (r) goto out; if (vcpu->arch.mmu->root_role.direct) r = mmu_alloc_direct_roots(vcpu); else r = mmu_alloc_shadow_roots(vcpu); if (r) goto out; kvm_mmu_sync_roots(vcpu); kvm_mmu_load_pgd(vcpu); /* * Flush any TLB entries for the new root, the provenance of the root * is unknown. Even if KVM ensures there are no stale TLB entries * for a freed root, in theory another hypervisor could have left * stale entries. Flushing on alloc also allows KVM to skip the TLB * flush when freeing a root (see kvm_tdp_mmu_put_root()). */ kvm_x86_call(flush_tlb_current)(vcpu); out: return r; } void kvm_mmu_unload(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa)); kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa)); vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); } static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa) { struct kvm_mmu_page *sp; if (!VALID_PAGE(root_hpa)) return false; /* * When freeing obsolete roots, treat roots as obsolete if they don't * have an associated shadow page, as it's impossible to determine if * such roots are fresh or stale. This does mean KVM will get false * positives and free roots that don't strictly need to be freed, but * such false positives are relatively rare: * * (a) only PAE paging and nested NPT have roots without shadow pages * (or any shadow paging flavor with a dummy root, see note below) * (b) remote reloads due to a memslot update obsoletes _all_ roots * (c) KVM doesn't track previous roots for PAE paging, and the guest * is unlikely to zap an in-use PGD. * * Note! Dummy roots are unique in that they are obsoleted by memslot * _creation_! See also FNAME(fetch). */ sp = root_to_sp(root_hpa); return !sp || is_obsolete_sp(kvm, sp); } static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; int i; if (is_obsolete_root(kvm, mmu->root.hpa)) roots_to_free |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa)) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } if (roots_to_free) kvm_mmu_free_roots(kvm, mmu, roots_to_free); } void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) { __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) { u64 gentry = 0; int r; /* * Assume that the pte write on a page table of the same type * as the current vcpu paging mode since we update the sptes only * when they have the same mode. */ if (is_pae(vcpu) && *bytes == 4) { /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; } if (*bytes == 4 || *bytes == 8) { r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes); if (r) gentry = 0; } return gentry; } /* * If we're seeing too many writes to a page, it may no longer be a page table, * or we may be forking, in which case it is better to unmap the page. */ static bool detect_write_flooding(struct kvm_mmu_page *sp) { /* * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected. */ if (sp->role.level == PG_LEVEL_4K) return false; atomic_inc(&sp->write_flooding_count); return atomic_read(&sp->write_flooding_count) >= 3; } /* * Misaligned accesses are too much trouble to fix up; also, they usually * indicate a page is not used as a page table. */ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, int bytes) { unsigned offset, pte_size, misaligned; offset = offset_in_page(gpa); pte_size = sp->role.has_4_byte_gpte ? 4 : 8; /* * Sometimes, the OS only writes the last one bytes to update status * bits, for example, in linux, andb instruction is used in clear_bit(). */ if (!(offset & (pte_size - 1)) && bytes == 1) return false; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; return misaligned; } static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) { unsigned page_offset, quadrant; u64 *spte; int level; page_offset = offset_in_page(gpa); level = sp->role.level; *nspte = 1; if (sp->role.has_4_byte_gpte) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map * only 2MB. So we need to double the offset again * and zap two pdes instead of one. */ if (level == PT32_ROOT_LEVEL) { page_offset &= ~7; /* kill rounding error */ page_offset <<= 1; *nspte = 2; } quadrant = page_offset >> PAGE_SHIFT; page_offset &= ~PAGE_MASK; if (quadrant != sp->role.quadrant) return NULL; } spte = &sp->spt[page_offset / sizeof(*spte)]; return spte; } void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; bool flush = false; /* * When emulating guest writes, ensure the written value is visible to * any task that is handling page faults before checking whether or not * KVM is shadowing a guest PTE. This ensures either KVM will create * the correct SPTE in the page fault handler, or this task will see * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in * account_shadowed(). */ smp_mb(); if (!vcpu->kvm->arch.indirect_shadow_pages) return; write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); ++vcpu->kvm->stat.mmu_pte_write; for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } spte = get_written_sptes(sp, gpa, &npte); if (!spte) continue; while (npte--) { entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; if (is_shadow_present_pte(entry)) flush = true; ++spte; } } kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); write_unlock(&vcpu->kvm->mmu_lock); } int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; /* * Except for reserved faults (emulated MMIO is shared-only), set the * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's * current attributes, which are the source of truth for such VMs. Note, * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't * currently supported nested virtualization (among many other things) * for software-protected VMs. */ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && !(error_code & PFERR_RSVD_MASK) && vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) error_code |= PFERR_PRIVATE_ACCESS; r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS)) return -EFAULT; r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { vcpu->stat.pf_taken++; r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, &emulation_type, NULL); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } if (r < 0) return r; if (r == RET_PF_FIXED) vcpu->stat.pf_fixed++; else if (r == RET_PF_EMULATE) vcpu->stat.pf_emulate++; else if (r == RET_PF_SPURIOUS) vcpu->stat.pf_spurious++; if (r != RET_PF_EMULATE) return 1; /* * Before emulating the instruction, check if the error code * was due to a RO violation while translating the guest page. * This can occur when using nested virtualization with nested * paging in both guests. If true, we simply unprotect the page * and resume the guest. */ if (vcpu->arch.mmu->root_role.direct && (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); return 1; } /* * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still * optimistically try to just unprotect the page and let the processor * re-execute the instruction that caused the page fault. Do not allow * retrying MMIO emulation, as it's not only pointless but could also * cause us to enter an infinite loop because the processor will keep * faulting on the non-existent MMIO address. Retrying an instruction * from a nested guest is also pointless and dangerous as we are only * explicitly shadowing L1's page tables, i.e. unprotecting something * for L1 isn't going to magically fix whatever issue cause L2 to fail. */ if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) { u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; int root_level, leaf, level; leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level); if (unlikely(leaf < 0)) return; pr_err("%s %llx", msg, gpa); for (level = root_level; level >= leaf; level--) pr_cont(", spte[%d] = 0x%llx", level, sptes[level]); pr_cont("\n"); } EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes); static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; vcpu_clear_mmio_info(vcpu, addr); /* * Walking and synchronizing SPTEs both assume they are operating in * the context of the current MMU, and would need to be reworked if * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT. */ if (WARN_ON_ONCE(mmu != vcpu->arch.mmu)) return; if (!VALID_PAGE(root_hpa)) return; write_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) { struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep); if (sp->unsync) { int ret = kvm_sync_spte(vcpu, sp, iterator.index); if (ret < 0) mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL); if (ret) kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep); } if (!sp->unsync_children) break; } write_unlock(&vcpu->kvm->mmu_lock); } void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, unsigned long roots) { int i; WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL); /* It's actually a GPA for vcpu->arch.guest_mmu. */ if (mmu != &vcpu->arch.guest_mmu) { /* INVLPG on a non-canonical address is a NOP according to the SDM. */ if (is_noncanonical_address(addr, vcpu)) return; kvm_x86_call(flush_tlb_gva)(vcpu, addr); } if (!mmu->sync_spte) return; if (roots & KVM_MMU_ROOT_CURRENT) __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (roots & KVM_MMU_ROOT_PREVIOUS(i)) __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa); } } EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { /* * INVLPG is required to invalidate any global mappings for the VA, * irrespective of PCID. Blindly sync all roots as it would take * roughly the same amount of work/time to determine whether any of the * previous roots have a global mapping. * * Mappings not reachable via the current or previous cached roots will * be synced when switching to that new cr3, so nothing needs to be * done here for them. */ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; unsigned long roots = 0; uint i; if (pcid == kvm_get_active_pcid(vcpu)) roots |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) roots |= KVM_MMU_ROOT_PREVIOUS(i); } if (roots) kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots); ++vcpu->stat.invlpg; /* * Mappings not reachable via the current cr3 or the prev_roots will be * synced when switching to that cr3, so nothing needs to be done here * for them. */ } void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level) { tdp_enabled = enable_tdp; tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; #ifdef CONFIG_X86_64 tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled; #endif /* * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when * the kernel is not. But, KVM never creates a page size greater than * what is used by the kernel for any given HVA, i.e. the kernel's * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). */ if (tdp_enabled) max_huge_page_level = tdp_huge_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) max_huge_page_level = PG_LEVEL_1G; else max_huge_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ typedef bool (*slot_rmaps_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot); static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, const struct kvm_memory_slot *slot, slot_rmaps_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, bool flush) { struct slot_rmap_walk_iterator iterator; lockdep_assert_held_write(&kvm->mmu_lock); for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, end_gfn, &iterator) { if (iterator.rmap) flush |= fn(kvm, iterator.rmap, slot); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && flush_on_yield) { kvm_flush_remote_tlbs_range(kvm, start_gfn, iterator.gfn - start_gfn + 1); flush = false; } cond_resched_rwlock_write(&kvm->mmu_lock); } } return flush; } static __always_inline bool walk_slot_rmaps(struct kvm *kvm, const struct kvm_memory_slot *slot, slot_rmaps_handler fn, int start_level, int end_level, bool flush_on_yield) { return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, slot->base_gfn, slot->base_gfn + slot->npages - 1, flush_on_yield, false); } static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, const struct kvm_memory_slot *slot, slot_rmaps_handler fn, bool flush_on_yield) { return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); } static void free_mmu_pages(struct kvm_mmu *mmu) { if (!tdp_enabled && mmu->pae_root) set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->pml4_root); free_page((unsigned long)mmu->pml5_root); } static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { struct page *page; int i; mmu->root.hpa = INVALID_PAGE; mmu->root.pgd = 0; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu) return 0; /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU * creation. When emulating 32-bit mode, cr3 is only 32 bits even on * x86_64. Therefore we need to allocate the PDP table in the first * 4GB of memory, which happens to fit the DMA32 zone. TDP paging * generally doesn't use PAE paging and can skip allocating the PDP * table. The main exception, handled here, is SVM's 32-bit NPT. The * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit * KVM; that horror is handled on-demand by mmu_alloc_special_roots(). */ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); if (!page) return -ENOMEM; mmu->pae_root = page_address(page); /* * CR3 is only 32 bits when PAE paging is used, thus it's impossible to * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so * that KVM's writes and the CPU's reads get along. Note, this is * only necessary when using shadow paging, as 64-bit NPT can get at * the C-bit even when shadowing 32-bit NPT, and SME isn't supported * by 32-bit kernels (when KVM itself uses 32-bit NPT). */ if (!tdp_enabled) set_memory_decrypted((unsigned long)mmu->pae_root, 1); else WARN_ON_ONCE(shadow_me_value); for (i = 0; i < 4; ++i) mmu->pae_root[i] = INVALID_PAE_ROOT; return 0; } int kvm_mmu_create(struct kvm_vcpu *vcpu) { int ret; vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu_shadow_page_cache.init_value = SHADOW_NONPRESENT_VALUE; if (!vcpu->arch.mmu_shadow_page_cache.init_value) vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); if (ret) goto fail_allocate_root; return ret; fail_allocate_root: free_mmu_pages(&vcpu->arch.guest_mmu); return ret; } #define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; bool unstable; restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { /* * No obsolete valid page exists before a newly created page * since active_mmu_pages is a FIFO list. */ if (!is_obsolete_sp(kvm, sp)) break; /* * Invalid pages should never land back on the list of active * pages. Skip the bogus page, otherwise we'll get stuck in an * infinite loop if the page gets put back on the list (again). */ if (WARN_ON_ONCE(sp->role.invalid)) continue; /* * No need to flush the TLB since we're only zapping shadow * pages with an obsolete generation number and all vCPUS have * loaded a new root, i.e. the shadow pages being zapped cannot * be in active use by the guest. */ if (batch >= BATCH_ZAP_PAGES && cond_resched_rwlock_write(&kvm->mmu_lock)) { batch = 0; goto restart; } unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &kvm->arch.zapped_obsolete_pages, &nr_zapped); batch += nr_zapped; if (unstable) goto restart; } /* * Kick all vCPUs (via remote TLB flush) before freeing the page tables * to ensure KVM is not in the middle of a lockless shadow page table * walk, which may reference the pages. The remote TLB flush itself is * not required and is simply a convenient way to kick vCPUs as needed. * KVM performs a local TLB flush when allocating a new root (see * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are * running with an obsolete MMU. */ kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); } /* * Fast invalidate all shadow pages and use lock-break technique * to zap obsolete pages. * * It's required when memslot is being deleted or VM is being * destroyed, in these cases, we should ensure that KVM MMU does * not use any resource of the being-deleted slot or all slots * after calling the function. */ static void kvm_mmu_zap_all_fast(struct kvm *kvm) { lockdep_assert_held(&kvm->slots_lock); write_lock(&kvm->mmu_lock); trace_kvm_mmu_zap_all_fast(kvm); /* * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is * held for the entire duration of zapping obsolete pages, it's * impossible for there to be multiple invalid generations associated * with *valid* shadow pages at any given time, i.e. there is exactly * one valid generation and (at most) one invalid generation. */ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; /* * In order to ensure all vCPUs drop their soon-to-be invalid roots, * invalidating TDP MMU roots must be done while holding mmu_lock for * write and in the same critical section as making the reload request, * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield. */ if (tdp_mmu_enabled) kvm_tdp_mmu_invalidate_all_roots(kvm); /* * Notify all vcpus to reload its shadow page table and flush TLB. * Then all vcpus will switch to new shadow page table with the new * mmu_valid_gen. * * Note: we need to do this under the protection of mmu_lock, * otherwise, vcpu would purge shadow page but miss tlb flush. */ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS); kvm_zap_obsolete_pages(kvm); write_unlock(&kvm->mmu_lock); /* * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before * returning to the caller, e.g. if the zap is in response to a memslot * deletion, mmu_notifier callbacks will be unable to reach the SPTEs * associated with the deleted memslot once the update completes, and * Deferring the zap until the final reference to the root is put would * lead to use-after-free. */ if (tdp_mmu_enabled) kvm_tdp_mmu_zap_invalidated_roots(kvm); } static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) { return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } void kvm_mmu_init_vm(struct kvm *kvm) { kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); if (tdp_mmu_enabled) kvm_mmu_init_tdp_mmu(kvm); kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO; kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; } static void mmu_free_vm_memory_caches(struct kvm *kvm) { kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache); kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache); kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache); } void kvm_mmu_uninit_vm(struct kvm *kvm) { if (tdp_mmu_enabled) kvm_mmu_uninit_tdp_mmu(kvm); mmu_free_vm_memory_caches(kvm); } static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; struct kvm_memslot_iter iter; bool flush = false; gfn_t start, end; int i; if (!kvm_memslots_have_rmaps(kvm)) return flush; for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { memslot = iter.slot; start = max(gfn_start, memslot->base_gfn); end = min(gfn_end, memslot->base_gfn + memslot->npages); if (WARN_ON_ONCE(start >= end)) continue; flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } } return flush; } /* * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end * (not including it) */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { bool flush; if (WARN_ON_ONCE(gfn_end <= gfn_start)) return; write_lock(&kvm->mmu_lock); kvm_mmu_invalidate_begin(kvm); kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end); flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); if (tdp_mmu_enabled) flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush); if (flush) kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); kvm_mmu_invalidate_end(kvm); write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { return rmap_write_protect(rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); write_unlock(&kvm->mmu_lock); } if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); } } static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) { return kvm_mmu_memory_cache_nr_free_objects(cache) < min; } static bool need_topup_split_caches_or_resched(struct kvm *kvm) { if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) return true; /* * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed * to split a single huge page. Calculating how many are actually needed * is possible but not worth the complexity. */ return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) || need_topup(&kvm->arch.split_page_header_cache, 1) || need_topup(&kvm->arch.split_shadow_page_cache, 1); } static int topup_split_caches(struct kvm *kvm) { /* * Allocating rmap list entries when splitting huge pages for nested * MMUs is uncommon as KVM needs to use a list if and only if there is * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be * aliased by multiple L2 gfns and/or from multiple nested roots with * different roles. Aliasing gfns when using TDP is atypical for VMMs; * a few gfns are often aliased during boot, e.g. when remapping BIOS, * but aliasing rarely occurs post-boot or for many gfns. If there is * only one rmap entry, rmap->val points directly at that one entry and * doesn't need to allocate a list. Buffer the cache by the default * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM * encounters an aliased gfn or two. */ const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS + KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE; int r; lockdep_assert_held(&kvm->slots_lock); r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity, SPLIT_DESC_CACHE_MIN_NR_OBJECTS); if (r) return r; r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); if (r) return r; return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1); } static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep) { struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); struct shadow_page_caches caches = {}; union kvm_mmu_page_role role; unsigned int access; gfn_t gfn; gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep)); /* * Note, huge page splitting always uses direct shadow pages, regardless * of whether the huge page itself is mapped by a direct or indirect * shadow page, since the huge page region itself is being directly * mapped with smaller pages. */ role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access); /* Direct SPs do not require a shadowed_info_cache. */ caches.page_header_cache = &kvm->arch.split_page_header_cache; caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache; /* Safe to pass NULL for vCPU since requesting a direct SP. */ return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role); } static void shadow_mmu_split_huge_page(struct kvm *kvm, const struct kvm_memory_slot *slot, u64 *huge_sptep) { struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache; u64 huge_spte = READ_ONCE(*huge_sptep); struct kvm_mmu_page *sp; bool flush = false; u64 *sptep, spte; gfn_t gfn; int index; sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep); for (index = 0; index < SPTE_ENT_PER_PAGE; index++) { sptep = &sp->spt[index]; gfn = kvm_mmu_page_get_gfn(sp, index); /* * The SP may already have populated SPTEs, e.g. if this huge * page is aliased by multiple sptes with the same access * permissions. These entries are guaranteed to map the same * gfn-to-pfn translation since the SP is direct, so no need to * modify them. * * However, if a given SPTE points to a lower level page table, * that lower level page table may only be partially populated. * Installing such SPTEs would effectively unmap a potion of the * huge page. Unmapping guest memory always requires a TLB flush * since a subsequent operation on the unmapped regions would * fail to detect the need to flush. */ if (is_shadow_present_pte(*sptep)) { flush |= !is_last_spte(*sptep, sp->role.level); continue; } spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); mmu_spte_set(sptep, spte); __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); } __link_shadow_page(kvm, cache, huge_sptep, sp, flush); } static int shadow_mmu_try_split_huge_page(struct kvm *kvm, const struct kvm_memory_slot *slot, u64 *huge_sptep) { struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); int level, r = 0; gfn_t gfn; u64 spte; /* Grab information for the tracepoint before dropping the MMU lock. */ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); level = huge_sp->role.level; spte = *huge_sptep; if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) { r = -ENOSPC; goto out; } if (need_topup_split_caches_or_resched(kvm)) { write_unlock(&kvm->mmu_lock); cond_resched(); /* * If the topup succeeds, return -EAGAIN to indicate that the * rmap iterator should be restarted because the MMU lock was * dropped. */ r = topup_split_caches(kvm) ?: -EAGAIN; write_lock(&kvm->mmu_lock); goto out; } shadow_mmu_split_huge_page(kvm, slot, huge_sptep); out: trace_kvm_mmu_split_huge_page(gfn, spte, level, r); return r; } static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { struct rmap_iterator iter; struct kvm_mmu_page *sp; u64 *huge_sptep; int r; restart: for_each_rmap_spte(rmap_head, &iter, huge_sptep) { sp = sptep_to_sp(huge_sptep); /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ if (WARN_ON_ONCE(!sp->role.guest_mode)) continue; /* The rmaps should never contain non-leaf SPTEs. */ if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) continue; /* SPs with level >PG_LEVEL_4K should never by unsync. */ if (WARN_ON_ONCE(sp->unsync)) continue; /* Don't bother splitting huge pages on invalid SPs. */ if (sp->role.invalid) continue; r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep); /* * The split succeeded or needs to be retried because the MMU * lock was dropped. Either way, restart the iterator to get it * back into a consistent state. */ if (!r || r == -EAGAIN) goto restart; /* The split failed and shouldn't be retried (e.g. -ENOMEM). */ break; } return false; } static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t start, gfn_t end, int target_level) { int level; /* * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working * down to the target level. This ensures pages are recursively split * all the way to the target level. There's no need to split pages * already at the target level. */ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, level, level, start, end - 1, true, false); } /* Must be called with the mmu_lock held in write-mode. */ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level) { if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); /* * A TLB flush is unnecessary at this point for the same reasons as in * kvm_mmu_slot_try_split_huge_pages(). */ } void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, int target_level) { u64 start = memslot->base_gfn; u64 end = start + memslot->npages; if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); write_unlock(&kvm->mmu_lock); } read_lock(&kvm->mmu_lock); kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); read_unlock(&kvm->mmu_lock); /* * No TLB flush is necessary here. KVM will flush TLBs after * write-protecting and/or clearing dirty on the newly split SPTEs to * ensure that guest writes are reflected in the dirty log before the * ioctl to enable dirty logging on this memslot completes. Since the * split SPTEs retain the write and dirty bits of the huge SPTE, it is * safe for KVM to decide if a TLB flush is necessary based on the split * SPTEs. */ } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; struct kvm_mmu_page *sp; restart: for_each_rmap_spte(rmap_head, &iter, sptep) { sp = sptep_to_sp(sptep); /* * We cannot do huge page mapping for indirect shadow pages, * which are found on the last rmap (level = 1) when not using * tdp; such shadow pages are synced with the page table in * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, PG_LEVEL_NUM)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) kvm_flush_remote_tlbs_sptep(kvm, sptep); else need_tlb_flush = 1; goto restart; } } return need_tlb_flush; } EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { /* * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap * pages that are already mapped at the maximum hugepage level. */ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) kvm_flush_remote_tlbs_memslot(kvm, slot); } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); kvm_rmap_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); read_unlock(&kvm->mmu_lock); } } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); /* * Clear dirty bits only on 4k SPTEs since the legacy MMU only * support dirty logging at a 4k granularity. */ walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); } /* * The caller will flush the TLBs after this function returns. * * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB * out of mmu lock also guarantees no dirty pages will be lost in * dirty_bitmap. */ } static void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); int ign; write_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (WARN_ON_ONCE(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; if (cond_resched_rwlock_write(&kvm->mmu_lock)) goto restart; } kvm_mmu_commit_zap_page(kvm, &invalid_list); if (tdp_mmu_enabled) kvm_tdp_mmu_zap_all(kvm); write_unlock(&kvm->mmu_lock); } void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_mmu_zap_all(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { kvm_mmu_zap_all_fast(kvm); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); gen &= MMIO_SPTE_GEN_MASK; /* * Generation numbers are incremented in multiples of the number of * address spaces in order to provide unique generations across all * address spaces. Strip what is effectively the address space * modifier prior to checking for a wrap of the MMIO generation so * that a wrap in any address space is detected. */ gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1); /* * The very rare case: if the MMIO generation number has wrapped, * zap all shadow pages. */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_zap_all_fast(kvm); } } static unsigned long mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; int nr_to_scan = sc->nr_to_scan; unsigned long freed = 0; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; /* * Never scan more than sc->nr_to_scan VM instances. * Will not hit this condition practically since we do not try * to shrink more than one VM and it is very unlikely to see * !n_used_mmu_pages so many times. */ if (!nr_to_scan--) break; /* * n_used_mmu_pages is accessed without holding kvm->mmu_lock * here. We may skip a VM instance errorneosly, but we do not * want to shrink a VM that only started to populate its MMU * anyway. */ if (!kvm->arch.n_used_mmu_pages && !kvm_has_zapped_obsolete_pages(kvm)) continue; idx = srcu_read_lock(&kvm->srcu); write_lock(&kvm->mmu_lock); if (kvm_has_zapped_obsolete_pages(kvm)) { kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); goto unlock; } freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); unlock: write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); /* * unfair on small ones * per-vm shrinkers cry out * sadness comes quickly */ list_move_tail(&kvm->vm_list, &vm_list); break; } mutex_unlock(&kvm_lock); return freed; } static unsigned long mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } static struct shrinker *mmu_shrinker; static void mmu_destroy_caches(void) { kmem_cache_destroy(pte_list_desc_cache); kmem_cache_destroy(mmu_page_header_cache); } static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) return sysfs_emit(buffer, "never\n"); return param_get_bool(buffer, kp); } static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); } static void __set_nx_huge_pages(bool val) { nx_huge_pages = itlb_multihit_kvm_mitigation = val; } static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) { bool old_val = nx_huge_pages; bool new_val; if (nx_hugepage_mitigation_hard_disabled) return -EPERM; /* In "auto" mode deploy workaround only if CPU has the bug. */ if (sysfs_streq(val, "off")) { new_val = 0; } else if (sysfs_streq(val, "force")) { new_val = 1; } else if (sysfs_streq(val, "auto")) { new_val = get_nx_auto_mode(); } else if (sysfs_streq(val, "never")) { new_val = 0; mutex_lock(&kvm_lock); if (!list_empty(&vm_list)) { mutex_unlock(&kvm_lock); return -EBUSY; } nx_hugepage_mitigation_hard_disabled = true; mutex_unlock(&kvm_lock); } else if (kstrtobool(val, &new_val) < 0) { return -EINVAL; } __set_nx_huge_pages(new_val); if (new_val != old_val) { struct kvm *kvm; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { mutex_lock(&kvm->slots_lock); kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); wake_up_process(kvm->arch.nx_huge_page_recovery_thread); } mutex_unlock(&kvm_lock); } return 0; } /* * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as * its default value of -1 is technically undefined behavior for a boolean. * Forward the module init call to SPTE code so that it too can handle module * params that need to be resolved/snapshot. */ void __init kvm_mmu_x86_module_init(void) { if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); /* * Snapshot userspace's desire to enable the TDP MMU. Whether or not the * TDP MMU is actually enabled is determined in kvm_configure_mmu() * when the vendor module is loaded. */ tdp_mmu_allowed = tdp_mmu_enabled; kvm_mmu_spte_module_init(); } /* * The bulk of the MMU initialization is deferred until the vendor module is * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need * to be reset when a potentially different vendor module is loaded. */ int kvm_mmu_vendor_module_init(void) { int ret = -ENOMEM; /* * MMU roles use union aliasing which is, generally speaking, an * undefined behavior. However, we supposedly know how compilers behave * and the current status quo is unlikely to change. Guardians below are * supposed to let us know if the assumption becomes false. */ BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32)); BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32)); BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64)); kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT); if (!pte_list_desc_cache) goto out; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof(struct kvm_mmu_page), 0, SLAB_ACCOUNT, NULL); if (!mmu_page_header_cache) goto out; if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto out; mmu_shrinker = shrinker_alloc(0, "x86-mmu"); if (!mmu_shrinker) goto out_shrinker; mmu_shrinker->count_objects = mmu_shrink_count; mmu_shrinker->scan_objects = mmu_shrink_scan; mmu_shrinker->seeks = DEFAULT_SEEKS * 10; shrinker_register(mmu_shrinker); return 0; out_shrinker: percpu_counter_destroy(&kvm_total_used_mmu_pages); out: mmu_destroy_caches(); return ret; } void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); free_mmu_pages(&vcpu->arch.root_mmu); free_mmu_pages(&vcpu->arch.guest_mmu); mmu_free_memory_caches(vcpu); } void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); shrinker_free(mmu_shrinker); } /* * Calculate the effective recovery period, accounting for '0' meaning "let KVM * select a halving time of 1 hour". Returns true if recovery is enabled. */ static bool calc_nx_huge_pages_recovery_period(uint *period) { /* * Use READ_ONCE to get the params, this may be called outside of the * param setters, e.g. by the kthread to compute its next timeout. */ bool enabled = READ_ONCE(nx_huge_pages); uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); if (!enabled || !ratio) return false; *period = READ_ONCE(nx_huge_pages_recovery_period_ms); if (!*period) { /* Make sure the period is not less than one second. */ ratio = min(ratio, 3600u); *period = 60 * 60 * 1000 / ratio; } return true; } static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) { bool was_recovery_enabled, is_recovery_enabled; uint old_period, new_period; int err; if (nx_hugepage_mitigation_hard_disabled) return -EPERM; was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); if (err) return err; is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period); if (is_recovery_enabled && (!was_recovery_enabled || old_period > new_period)) { struct kvm *kvm; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) wake_up_process(kvm->arch.nx_huge_page_recovery_thread); mutex_unlock(&kvm_lock); } return err; } static void kvm_recover_nx_huge_pages(struct kvm *kvm) { unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; struct kvm_memory_slot *slot; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; LIST_HEAD(invalid_list); bool flush = false; ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); write_lock(&kvm->mmu_lock); /* * Zapping TDP MMU shadow pages, including the remote TLB flush, must * be done under RCU protection, because the pages are freed via RCU * callback. */ rcu_read_lock(); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { if (list_empty(&kvm->arch.possible_nx_huge_pages)) break; /* * We use a separate list instead of just using active_mmu_pages * because the number of shadow pages that be replaced with an * NX huge page is expected to be relatively small compared to * the total number of shadow pages. And because the TDP MMU * doesn't use active_mmu_pages. */ sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, struct kvm_mmu_page, possible_nx_huge_page_link); WARN_ON_ONCE(!sp->nx_huge_page_disallowed); WARN_ON_ONCE(!sp->role.direct); /* * Unaccount and do not attempt to recover any NX Huge Pages * that are being dirty tracked, as they would just be faulted * back in as 4KiB pages. The NX Huge Pages in this slot will be * recovered, along with all the other huge pages in the slot, * when dirty logging is disabled. * * Since gfn_to_memslot() is relatively expensive, it helps to * skip it if it the test cannot possibly return true. On the * other hand, if any memslot has logging enabled, chances are * good that all of them do, in which case unaccount_nx_huge_page() * is much cheaper than zapping the page. * * If a memslot update is in progress, reading an incorrect value * of kvm->nr_memslots_dirty_logging is not a problem: if it is * becoming zero, gfn_to_memslot() will be done unnecessarily; if * it is becoming nonzero, the page will be zapped unnecessarily. * Either way, this only affects efficiency in racy situations, * and not correctness. */ slot = NULL; if (atomic_read(&kvm->nr_memslots_dirty_logging)) { struct kvm_memslots *slots; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); WARN_ON_ONCE(!slot); } if (slot && kvm_slot_dirty_track_enabled(slot)) unaccount_nx_huge_page(kvm, sp); else if (is_tdp_mmu_page(sp)) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); rcu_read_unlock(); cond_resched_rwlock_write(&kvm->mmu_lock); flush = false; rcu_read_lock(); } } kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); rcu_read_unlock(); write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } static long get_nx_huge_page_recovery_timeout(u64 start_time) { bool enabled; uint period; enabled = calc_nx_huge_pages_recovery_period(&period); return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64() : MAX_SCHEDULE_TIMEOUT; } static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) { u64 start_time; long remaining_time; while (true) { start_time = get_jiffies_64(); remaining_time = get_nx_huge_page_recovery_timeout(start_time); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop() && remaining_time > 0) { schedule_timeout(remaining_time); remaining_time = get_nx_huge_page_recovery_timeout(start_time); set_current_state(TASK_INTERRUPTIBLE); } set_current_state(TASK_RUNNING); if (kthread_should_stop()) return 0; kvm_recover_nx_huge_pages(kvm); } } int kvm_mmu_post_init_vm(struct kvm *kvm) { int err; if (nx_hugepage_mitigation_hard_disabled) return 0; err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, "kvm-nx-lpage-recovery", &kvm->arch.nx_huge_page_recovery_thread); if (!err) kthread_unpark(kvm->arch.nx_huge_page_recovery_thread); return err; } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { if (kvm->arch.nx_huge_page_recovery_thread) kthread_stop(kvm->arch.nx_huge_page_recovery_thread); } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { /* * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM * can simply ignore such slots. But if userspace is making memory * PRIVATE, then KVM must prevent the guest from accessing the memory * as shared. And if userspace is making memory SHARED and this point * is reached, then at least one page within the range was previously * PRIVATE, i.e. the slot's possible hugepage ranges are changing. * Zapping SPTEs in this case ensures KVM will reassess whether or not * a hugepage can be used for affected ranges. */ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; return kvm_unmap_gfn_range(kvm, range); } static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; } static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; } static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; } static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) { const unsigned long start = gfn; const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); if (level == PG_LEVEL_2M) return kvm_range_has_memory_attributes(kvm, start, end, attrs); for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { if (hugepage_test_mixed(slot, gfn, level - 1) || attrs != kvm_get_memory_attributes(kvm, gfn)) return false; } return true; } bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { unsigned long attrs = range->arg.attributes; struct kvm_memory_slot *slot = range->slot; int level; lockdep_assert_held_write(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); /* * Calculate which ranges can be mapped with hugepages even if the slot * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over * a range that has PRIVATE GFNs, and conversely converting a range to * SHARED may now allow hugepages. */ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; /* * The sequence matters here: upper levels consume the result of lower * level's scanning. */ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); gfn_t gfn = gfn_round_for_level(range->start, level); /* Process the head page if it straddles the range. */ if (gfn != range->start || gfn + nr_pages > range->end) { /* * Skip mixed tracking if the aligned gfn isn't covered * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes. */ if (gfn >= slot->base_gfn && gfn + nr_pages <= slot->base_gfn + slot->npages) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else hugepage_set_mixed(slot, gfn, level); } gfn += nr_pages; } /* * Pages entirely covered by the range are guaranteed to have * only the attributes which were just set. */ for ( ; gfn + nr_pages <= range->end; gfn += nr_pages) hugepage_clear_mixed(slot, gfn, level); /* * Process the last tail page if it straddles the range and is * contained by the memslot. Like the head page, KVM can't * create a hugepage if the slot size is misaligned. */ if (gfn < range->end && (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else hugepage_set_mixed(slot, gfn, level); } } return false; } void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, struct kvm_memory_slot *slot) { int level; if (!kvm_arch_has_private_mem(kvm)) return; for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { /* * Don't bother tracking mixed attributes for pages that can't * be huge due to alignment, i.e. process only pages that are * entirely contained by the memslot. */ gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level); gfn_t start = gfn_round_for_level(slot->base_gfn, level); gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); gfn_t gfn; if (start < slot->base_gfn) start += nr_pages; /* * Unlike setting attributes, every potential hugepage needs to * be manually checked as the attributes may already be mixed. */ for (gfn = start; gfn < end; gfn += nr_pages) { unsigned long attrs = kvm_get_memory_attributes(kvm, gfn); if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else hugepage_set_mixed(slot, gfn, level); } } } #endif |
4 1 1 2 1 1 1 1 1 2 1 1 1 1 12 13 13 2 2 2 2 4 3 3 3 3 1 2 1 1 23 9 23 1 436 436 2 1 1 309 311 2 25 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 | // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf-cgroup.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> #include <linux/btf.h> #include <linux/bug.h> #include <linux/filter.h> #include <linux/mm.h> #include <linux/rbtree.h> #include <linux/slab.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> #ifdef CONFIG_CGROUP_BPF #include "../cgroup/cgroup-internal.h" #define LOCAL_STORAGE_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; struct rb_root root; struct list_head list; }; static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) { return container_of(map, struct bpf_cgroup_storage_map, map); } static bool attach_type_isolated(const struct bpf_map *map) { return map->key_size == sizeof(struct bpf_cgroup_storage_key); } static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, const void *_key1, const void *_key2) { if (attach_type_isolated(&map->map)) { const struct bpf_cgroup_storage_key *key1 = _key1; const struct bpf_cgroup_storage_key *key2 = _key2; if (key1->cgroup_inode_id < key2->cgroup_inode_id) return -1; else if (key1->cgroup_inode_id > key2->cgroup_inode_id) return 1; else if (key1->attach_type < key2->attach_type) return -1; else if (key1->attach_type > key2->attach_type) return 1; } else { const __u64 *cgroup_inode_id1 = _key1; const __u64 *cgroup_inode_id2 = _key2; if (*cgroup_inode_id1 < *cgroup_inode_id2) return -1; else if (*cgroup_inode_id1 > *cgroup_inode_id2) return 1; } return 0; } struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked) { struct rb_root *root = &map->root; struct rb_node *node; if (!locked) spin_lock_bh(&map->lock); node = root->rb_node; while (node) { struct bpf_cgroup_storage *storage; storage = container_of(node, struct bpf_cgroup_storage, node); switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { case -1: node = node->rb_left; break; case 1: node = node->rb_right; break; default: if (!locked) spin_unlock_bh(&map->lock); return storage; } } if (!locked) spin_unlock_bh(&map->lock); return NULL; } static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage *storage) { struct rb_root *root = &map->root; struct rb_node **new = &(root->rb_node), *parent = NULL; while (*new) { struct bpf_cgroup_storage *this; this = container_of(*new, struct bpf_cgroup_storage, node); parent = *new; switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { case -1: new = &((*new)->rb_left); break; case 1: new = &((*new)->rb_right); break; default: return -EEXIST; } } rb_link_node(&storage->node, parent, new); rb_insert_color(&storage->node, root); return 0; } static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; storage = cgroup_storage_lookup(map, key, false); if (!storage) return NULL; return &READ_ONCE(storage->buf)->data[0]; } static long cgroup_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, key, false); if (!storage) return -ENOENT; if (flags & BPF_F_LOCK) { copy_map_value_locked(map, storage->buf->data, value, false); return 0; } new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; memcpy(&new->data[0], value, map->value_size); check_and_init_map_value(map, new->data); new = xchg(&storage->buf, new); kfree_rcu(new, rcu); return 0; } int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, void *value) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; rcu_read_lock(); storage = cgroup_storage_lookup(map, key, false); if (!storage) { rcu_read_unlock(); return -ENOENT; } /* per_cpu areas are zero-filled and bpf programs can only * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(storage->percpu_buf, cpu), size); off += size; } rcu_read_unlock(); return 0; } int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; if (map_flags != BPF_ANY && map_flags != BPF_EXIST) return -EINVAL; rcu_read_lock(); storage = cgroup_storage_lookup(map, key, false); if (!storage) { rcu_read_unlock(); return -ENOENT; } /* the user space will provide round_up(value_size, 8) bytes that * will be copied into per-cpu area. bpf programs can only access * value_size of it. During lookup the same extra bytes will be * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), value + off, size); off += size; } rcu_read_unlock(); return 0; } static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, void *_next_key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; spin_lock_bh(&map->lock); if (list_empty(&map->list)) goto enoent; if (key) { storage = cgroup_storage_lookup(map, key, true); if (!storage) goto enoent; storage = list_next_entry(storage, list_map); if (!storage) goto enoent; } else { storage = list_first_entry(&map->list, struct bpf_cgroup_storage, list_map); } spin_unlock_bh(&map->lock); if (attach_type_isolated(&map->map)) { struct bpf_cgroup_storage_key *next = _next_key; *next = storage->key; } else { __u64 *next = _next_key; *next = storage->key.cgroup_inode_id; } return 0; enoent: spin_unlock_bh(&map->lock); return -ENOENT; } static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE; int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu * is the same as other local storages. */ if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) max_value_size = min_t(__u32, max_value_size, PCPU_MIN_UNIT_SIZE); if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && attr->key_size != sizeof(__u64)) return ERR_PTR(-EINVAL); if (attr->value_size == 0) return ERR_PTR(-EINVAL); if (attr->value_size > max_value_size) return ERR_PTR(-E2BIG); if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags)) return ERR_PTR(-EINVAL); if (attr->max_entries) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node); if (!map) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); spin_lock_init(&map->lock); map->root = RB_ROOT; INIT_LIST_HEAD(&map->list); return &map->map; } static void cgroup_storage_map_free(struct bpf_map *_map) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct list_head *storages = &map->list; struct bpf_cgroup_storage *storage, *stmp; cgroup_lock(); list_for_each_entry_safe(storage, stmp, storages, list_map) { bpf_cgroup_storage_unlink(storage); bpf_cgroup_storage_free(storage); } cgroup_unlock(); WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); bpf_map_area_free(map); } static long cgroup_storage_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } static int cgroup_storage_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { if (attach_type_isolated(map)) { struct btf_member *m; u32 offset, size; /* Key is expected to be of struct bpf_cgroup_storage_key type, * which is: * struct bpf_cgroup_storage_key { * __u64 cgroup_inode_id; * __u32 attach_type; * }; */ /* * Key_type must be a structure with two fields. */ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || BTF_INFO_VLEN(key_type->info) != 2) return -EINVAL; /* * The first field must be a 64 bit integer at 0 offset. */ m = (struct btf_member *)(key_type + 1); size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) return -EINVAL; /* * The second field must be a 32 bit integer at 64 bit offset. */ m++; offset = offsetof(struct bpf_cgroup_storage_key, attach_type); size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) return -EINVAL; } else { u32 int_data; /* * Key is expected to be u64, which stores the cgroup_inode_id */ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) return -EINVAL; } return 0; } static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { enum bpf_cgroup_storage_type stype; struct bpf_cgroup_storage *storage; int cpu; rcu_read_lock(); storage = cgroup_storage_lookup(map_to_storage(map), key, false); if (!storage) { rcu_read_unlock(); return; } btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); stype = cgroup_storage_type(map); if (stype == BPF_CGROUP_STORAGE_SHARED) { seq_puts(m, ": "); btf_type_seq_show(map->btf, map->btf_value_type_id, &READ_ONCE(storage->buf)->data[0], m); seq_puts(m, "\n"); } else { seq_puts(m, ": {\n"); for_each_possible_cpu(cpu) { seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(storage->percpu_buf, cpu), m); seq_puts(m, "\n"); } seq_puts(m, "}\n"); } rcu_read_unlock(); } static u64 cgroup_storage_map_usage(const struct bpf_map *map) { /* Currently the dynamically allocated elements are not counted. */ return sizeof(struct bpf_cgroup_storage_map); } BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_cgroup_storage_map) const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, .map_get_next_key = cgroup_storage_get_next_key, .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, .map_mem_usage = cgroup_storage_map_usage, .map_btf_id = &cgroup_storage_map_btf_ids[0], }; int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); if (aux->cgroup_storage[stype] && aux->cgroup_storage[stype] != _map) return -EBUSY; aux->cgroup_storage[stype] = _map; return 0; } static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) { size_t size; if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { size = sizeof(struct bpf_storage_buffer) + map->value_size; *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, PAGE_SIZE) >> PAGE_SHIFT; } else { size = map->value_size; *pages = round_up(round_up(size, 8) * num_possible_cpus(), PAGE_SIZE) >> PAGE_SHIFT; } return size; } struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { const gfp_t gfp = __GFP_ZERO | GFP_USER; struct bpf_cgroup_storage *storage; struct bpf_map *map; size_t size; u32 pages; map = prog->aux->cgroup_storage[stype]; if (!map) return NULL; size = bpf_cgroup_storage_calculate_size(map, &pages); storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), gfp, map->numa_node); if (!storage) goto enomem; if (stype == BPF_CGROUP_STORAGE_SHARED) { storage->buf = bpf_map_kmalloc_node(map, size, gfp, map->numa_node); if (!storage->buf) goto enomem; check_and_init_map_value(map, storage->buf->data); } else { storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); if (!storage->percpu_buf) goto enomem; } storage->map = (struct bpf_cgroup_storage_map *)map; return storage; enomem: kfree(storage); return ERR_PTR(-ENOMEM); } static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) { struct bpf_cgroup_storage *storage = container_of(rcu, struct bpf_cgroup_storage, rcu); kfree(storage->buf); kfree(storage); } static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) { struct bpf_cgroup_storage *storage = container_of(rcu, struct bpf_cgroup_storage, rcu); free_percpu(storage->percpu_buf); kfree(storage); } void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) { enum bpf_cgroup_storage_type stype; struct bpf_map *map; if (!storage) return; map = &storage->map->map; stype = cgroup_storage_type(map); if (stype == BPF_CGROUP_STORAGE_SHARED) call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); else call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); } void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, struct cgroup *cgroup, enum bpf_attach_type type) { struct bpf_cgroup_storage_map *map; if (!storage) return; storage->key.attach_type = type; storage->key.cgroup_inode_id = cgroup_id(cgroup); map = storage->map; spin_lock_bh(&map->lock); WARN_ON(cgroup_storage_insert(map, storage)); list_add(&storage->list_map, &map->list); list_add(&storage->list_cg, &cgroup->bpf.storages); spin_unlock_bh(&map->lock); } void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) { struct bpf_cgroup_storage_map *map; struct rb_root *root; if (!storage) return; map = storage->map; spin_lock_bh(&map->lock); root = &map->root; rb_erase(&storage->node, root); list_del(&storage->list_map); list_del(&storage->list_cg); spin_unlock_bh(&map->lock); } #endif |
12 58 58 57 39 25 37 48 17 18 14 9 58 51 7 59 59 49 29 31 49 22 49 1 13 49 48 49 3 3 59 49 49 59 59 15 5 10 49 48 10 41 9 8 38 14 41 10 22 8 10 14 5 5 5 5 5 5 2 4 4 12 1 12 11 5 12 7 5 12 7 7 7 21 6 15 15 5 5 6 20 20 20 15 5 5 15 15 15 15 20 2 2 2 1 1 1 2 13 13 13 1 2 12 13 1 13 2 2 2 1 1 1 1 1 1 20 20 1 33 33 2 18 15 15 15 15 37 37 27 20 1 1 15 26 15 15 37 36 6 31 6 31 15 15 15 15 15 15 6 6 6 3 3 5 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ /* * jfs_logmgr.c: log manager * * for related information, see transaction manager (jfs_txnmgr.c), and * recovery manager (jfs_logredo.c). * * note: for detail, RTFS. * * log buffer manager: * special purpose buffer manager supporting log i/o requirements. * per log serial pageout of logpage * queuing i/o requests and redrive i/o at iodone * maintain current logpage buffer * no caching since append only * appropriate jfs buffer cache buffers as needed * * group commit: * transactions which wrote COMMIT records in the same in-memory * log page during the pageout of previous/current log page(s) are * committed together by the pageout of the page. * * TBD lazy commit: * transactions are committed asynchronously when the log page * containing it COMMIT is paged out when it becomes full; * * serialization: * . a per log lock serialize log write. * . a per log lock serialize group commit. * . a per log lock serialize log open/close; * * TBD log integrity: * careful-write (ping-pong) of last logpage to recover from crash * in overwrite. * detection of split (out-of-order) write of physical sectors * of last logpage via timestamp at end of each sector * with its mirror data array at trailer). * * alternatives: * lsn - 64-bit monotonically increasing integer vs * 32-bit lspn and page eor. */ #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/interrupt.h> #include <linux/completion.h> #include <linux/kthread.h> #include <linux/buffer_head.h> /* for sync_blockdev() */ #include <linux/bio.h> #include <linux/freezer.h> #include <linux/export.h> #include <linux/delay.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/slab.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_superblock.h" #include "jfs_txnmgr.h" #include "jfs_debug.h" /* * lbuf's ready to be redriven. Protected by log_redrive_lock (jfsIO thread) */ static struct lbuf *log_redrive_list; static DEFINE_SPINLOCK(log_redrive_lock); /* * log read/write serialization (per log) */ #define LOG_LOCK_INIT(log) mutex_init(&(log)->loglock) #define LOG_LOCK(log) mutex_lock(&((log)->loglock)) #define LOG_UNLOCK(log) mutex_unlock(&((log)->loglock)) /* * log group commit serialization (per log) */ #define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock) #define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock) #define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock) #define LOGGC_WAKEUP(tblk) wake_up_all(&(tblk)->gcwait) /* * log sync serialization (per log) */ #define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE) #define LOGSYNC_BARRIER(logsize) ((logsize)/4) /* #define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE) #define LOGSYNC_BARRIER(logsize) ((logsize)/2) */ /* * log buffer cache synchronization */ static DEFINE_SPINLOCK(jfsLCacheLock); #define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags) #define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags) /* * See __SLEEP_COND in jfs_locks.h */ #define LCACHE_SLEEP_COND(wq, cond, flags) \ do { \ if (cond) \ break; \ __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \ } while (0) #define LCACHE_WAKEUP(event) wake_up(event) /* * lbuf buffer cache (lCache) control */ /* log buffer manager pageout control (cumulative, inclusive) */ #define lbmREAD 0x0001 #define lbmWRITE 0x0002 /* enqueue at tail of write queue; * init pageout if at head of queue; */ #define lbmRELEASE 0x0004 /* remove from write queue * at completion of pageout; * do not free/recycle it yet: * caller will free it; */ #define lbmSYNC 0x0008 /* do not return to freelist * when removed from write queue; */ #define lbmFREE 0x0010 /* return to freelist * at completion of pageout; * the buffer may be recycled; */ #define lbmDONE 0x0020 #define lbmERROR 0x0040 #define lbmGC 0x0080 /* lbmIODone to perform post-GC processing * of log page */ #define lbmDIRECT 0x0100 /* * Global list of active external journals */ static LIST_HEAD(jfs_external_logs); static struct jfs_log *dummy_log; static DEFINE_MUTEX(jfs_log_mutex); /* * forward references */ static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck); static int lmNextPage(struct jfs_log * log); static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, int activate); static int open_inline_log(struct super_block *sb); static int open_dummy_log(struct super_block *sb); static int lbmLogInit(struct jfs_log * log); static void lbmLogShutdown(struct jfs_log * log); static struct lbuf *lbmAllocate(struct jfs_log * log, int); static void lbmFree(struct lbuf * bp); static void lbmfree(struct lbuf * bp); static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp); static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block); static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag); static int lbmIOWait(struct lbuf * bp, int flag); static bio_end_io_t lbmIODone; static void lbmStartIO(struct lbuf * bp); static void lmGCwrite(struct jfs_log * log, int cant_block); static int lmLogSync(struct jfs_log * log, int hard_sync); /* * statistics */ #ifdef CONFIG_JFS_STATISTICS static struct lmStat { uint commit; /* # of commit */ uint pagedone; /* # of page written */ uint submitted; /* # of pages submitted */ uint full_page; /* # of full pages submitted */ uint partial_page; /* # of partial pages submitted */ } lmStat; #endif static void write_special_inodes(struct jfs_log *log, int (*writer)(struct address_space *)) { struct jfs_sb_info *sbi; list_for_each_entry(sbi, &log->sb_list, log_list) { writer(sbi->ipbmap->i_mapping); writer(sbi->ipimap->i_mapping); writer(sbi->direct_inode->i_mapping); } } /* * NAME: lmLog() * * FUNCTION: write a log record; * * PARAMETER: * * RETURN: lsn - offset to the next log record to write (end-of-log); * -1 - error; * * note: todo: log error handler */ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) { int lsn; int diffp, difft; struct metapage *mp = NULL; unsigned long flags; jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p", log, tblk, lrd, tlck); LOG_LOCK(log); /* log by (out-of-transaction) JFS ? */ if (tblk == NULL) goto writeRecord; /* log from page ? */ if (tlck == NULL || tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL) goto writeRecord; /* * initialize/update page/transaction recovery lsn */ lsn = log->lsn; LOGSYNC_LOCK(log, flags); /* * initialize page lsn if first log write of the page */ if (mp->lsn == 0) { mp->log = log; mp->lsn = lsn; log->count++; /* insert page at tail of logsynclist */ list_add_tail(&mp->synclist, &log->synclist); } /* * initialize/update lsn of tblock of the page * * transaction inherits oldest lsn of pages associated * with allocation/deallocation of resources (their * log records are used to reconstruct allocation map * at recovery time: inode for inode allocation map, * B+-tree index of extent descriptors for block * allocation map); * allocation map pages inherit transaction lsn at * commit time to allow forwarding log syncpt past log * records associated with allocation/deallocation of * resources only after persistent map of these map pages * have been updated and propagated to home. */ /* * initialize transaction lsn: */ if (tblk->lsn == 0) { /* inherit lsn of its first page logged */ tblk->lsn = mp->lsn; log->count++; /* insert tblock after the page on logsynclist */ list_add(&tblk->synclist, &mp->synclist); } /* * update transaction lsn: */ else { /* inherit oldest/smallest lsn of page */ logdiff(diffp, mp->lsn, log); logdiff(difft, tblk->lsn, log); if (diffp < difft) { /* update tblock lsn with page lsn */ tblk->lsn = mp->lsn; /* move tblock after page on logsynclist */ list_move(&tblk->synclist, &mp->synclist); } } LOGSYNC_UNLOCK(log, flags); /* * write the log record */ writeRecord: lsn = lmWriteRecord(log, tblk, lrd, tlck); /* * forward log syncpt if log reached next syncpt trigger */ logdiff(diffp, lsn, log); if (diffp >= log->nextsync) lsn = lmLogSync(log, 0); /* update end-of-log lsn */ log->lsn = lsn; LOG_UNLOCK(log); /* return end-of-log address */ return lsn; } /* * NAME: lmWriteRecord() * * FUNCTION: move the log record to current log page * * PARAMETER: cd - commit descriptor * * RETURN: end-of-log address * * serialization: LOG_LOCK() held on entry/exit */ static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) { int lsn = 0; /* end-of-log address */ struct lbuf *bp; /* dst log page buffer */ struct logpage *lp; /* dst log page */ caddr_t dst; /* destination address in log page */ int dstoffset; /* end-of-log offset in log page */ int freespace; /* free space in log page */ caddr_t p; /* src meta-data page */ caddr_t src; int srclen; int nbytes; /* number of bytes to move */ int i; int len; struct linelock *linelock; struct lv *lv; struct lvd *lvd; int l2linesize; len = 0; /* retrieve destination log page to write */ bp = (struct lbuf *) log->bp; lp = (struct logpage *) bp->l_ldata; dstoffset = log->eor; /* any log data to write ? */ if (tlck == NULL) goto moveLrd; /* * move log record data */ /* retrieve source meta-data page to log */ if (tlck->flag & tlckPAGELOCK) { p = (caddr_t) (tlck->mp->data); linelock = (struct linelock *) & tlck->lock; } /* retrieve source in-memory inode to log */ else if (tlck->flag & tlckINODELOCK) { if (tlck->type & tlckDTREE) p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot; else p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot; linelock = (struct linelock *) & tlck->lock; } else { jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck); return 0; /* Probably should trap */ } l2linesize = linelock->l2linesize; moveData: ASSERT(linelock->index <= linelock->maxcnt); lv = linelock->lv; for (i = 0; i < linelock->index; i++, lv++) { if (lv->length == 0) continue; /* is page full ? */ if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) { /* page become full: move on to next page */ lmNextPage(log); bp = log->bp; lp = (struct logpage *) bp->l_ldata; dstoffset = LOGPHDRSIZE; } /* * move log vector data */ src = (u8 *) p + (lv->offset << l2linesize); srclen = lv->length << l2linesize; len += srclen; while (srclen > 0) { freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; nbytes = min(freespace, srclen); dst = (caddr_t) lp + dstoffset; memcpy(dst, src, nbytes); dstoffset += nbytes; /* is page not full ? */ if (dstoffset < LOGPSIZE - LOGPTLRSIZE) break; /* page become full: move on to next page */ lmNextPage(log); bp = (struct lbuf *) log->bp; lp = (struct logpage *) bp->l_ldata; dstoffset = LOGPHDRSIZE; srclen -= nbytes; src += nbytes; } /* * move log vector descriptor */ len += 4; lvd = (struct lvd *) ((caddr_t) lp + dstoffset); lvd->offset = cpu_to_le16(lv->offset); lvd->length = cpu_to_le16(lv->length); dstoffset += 4; jfs_info("lmWriteRecord: lv offset:%d length:%d", lv->offset, lv->length); } if ((i = linelock->next)) { linelock = (struct linelock *) lid_to_tlock(i); goto moveData; } /* * move log record descriptor */ moveLrd: lrd->length = cpu_to_le16(len); src = (caddr_t) lrd; srclen = LOGRDSIZE; while (srclen > 0) { freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; nbytes = min(freespace, srclen); dst = (caddr_t) lp + dstoffset; memcpy(dst, src, nbytes); dstoffset += nbytes; srclen -= nbytes; /* are there more to move than freespace of page ? */ if (srclen) goto pageFull; /* * end of log record descriptor */ /* update last log record eor */ log->eor = dstoffset; bp->l_eor = dstoffset; lsn = (log->page << L2LOGPSIZE) + dstoffset; if (lrd->type & cpu_to_le16(LOG_COMMIT)) { tblk->clsn = lsn; jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn, bp->l_eor); INCREMENT(lmStat.commit); /* # of commit */ /* * enqueue tblock for group commit: * * enqueue tblock of non-trivial/synchronous COMMIT * at tail of group commit queue * (trivial/asynchronous COMMITs are ignored by * group commit.) */ LOGGC_LOCK(log); /* init tblock gc state */ tblk->flag = tblkGC_QUEUE; tblk->bp = log->bp; tblk->pn = log->page; tblk->eor = log->eor; /* enqueue transaction to commit queue */ list_add_tail(&tblk->cqueue, &log->cqueue); LOGGC_UNLOCK(log); } jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x", le16_to_cpu(lrd->type), log->bp, log->page, dstoffset); /* page not full ? */ if (dstoffset < LOGPSIZE - LOGPTLRSIZE) return lsn; pageFull: /* page become full: move on to next page */ lmNextPage(log); bp = (struct lbuf *) log->bp; lp = (struct logpage *) bp->l_ldata; dstoffset = LOGPHDRSIZE; src += nbytes; } return lsn; } /* * NAME: lmNextPage() * * FUNCTION: write current page and allocate next page. * * PARAMETER: log * * RETURN: 0 * * serialization: LOG_LOCK() held on entry/exit */ static int lmNextPage(struct jfs_log * log) { struct logpage *lp; int lspn; /* log sequence page number */ int pn; /* current page number */ struct lbuf *bp; struct lbuf *nextbp; struct tblock *tblk; /* get current log page number and log sequence page number */ pn = log->page; bp = log->bp; lp = (struct logpage *) bp->l_ldata; lspn = le32_to_cpu(lp->h.page); LOGGC_LOCK(log); /* * write or queue the full page at the tail of write queue */ /* get the tail tblk on commit queue */ if (list_empty(&log->cqueue)) tblk = NULL; else tblk = list_entry(log->cqueue.prev, struct tblock, cqueue); /* every tblk who has COMMIT record on the current page, * and has not been committed, must be on commit queue * since tblk is queued at commit queueu at the time * of writing its COMMIT record on the page before * page becomes full (even though the tblk thread * who wrote COMMIT record may have been suspended * currently); */ /* is page bound with outstanding tail tblk ? */ if (tblk && tblk->pn == pn) { /* mark tblk for end-of-page */ tblk->flag |= tblkGC_EOP; if (log->cflag & logGC_PAGEOUT) { /* if page is not already on write queue, * just enqueue (no lbmWRITE to prevent redrive) * buffer to wqueue to ensure correct serial order * of the pages since log pages will be added * continuously */ if (bp->l_wqnext == NULL) lbmWrite(log, bp, 0, 0); } else { /* * No current GC leader, initiate group commit */ log->cflag |= logGC_PAGEOUT; lmGCwrite(log, 0); } } /* page is not bound with outstanding tblk: * init write or mark it to be redriven (lbmWRITE) */ else { /* finalize the page */ bp->l_ceor = bp->l_eor; lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0); } LOGGC_UNLOCK(log); /* * allocate/initialize next page */ /* if log wraps, the first data page of log is 2 * (0 never used, 1 is superblock). */ log->page = (pn == log->size - 1) ? 2 : pn + 1; log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */ /* allocate/initialize next log page buffer */ nextbp = lbmAllocate(log, log->page); nextbp->l_eor = log->eor; log->bp = nextbp; /* initialize next log page */ lp = (struct logpage *) nextbp->l_ldata; lp->h.page = lp->t.page = cpu_to_le32(lspn + 1); lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); return 0; } /* * NAME: lmGroupCommit() * * FUNCTION: group commit * initiate pageout of the pages with COMMIT in the order of * page number - redrive pageout of the page at the head of * pageout queue until full page has been written. * * RETURN: * * NOTE: * LOGGC_LOCK serializes log group commit queue, and * transaction blocks on the commit queue. * N.B. LOG_LOCK is NOT held during lmGroupCommit(). */ int lmGroupCommit(struct jfs_log * log, struct tblock * tblk) { int rc = 0; LOGGC_LOCK(log); /* group committed already ? */ if (tblk->flag & tblkGC_COMMITTED) { if (tblk->flag & tblkGC_ERROR) rc = -EIO; LOGGC_UNLOCK(log); return rc; } jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc); if (tblk->xflag & COMMIT_LAZY) tblk->flag |= tblkGC_LAZY; if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) && (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low)) { /* * No pageout in progress * * start group commit as its group leader. */ log->cflag |= logGC_PAGEOUT; lmGCwrite(log, 0); } if (tblk->xflag & COMMIT_LAZY) { /* * Lazy transactions can leave now */ LOGGC_UNLOCK(log); return 0; } /* lmGCwrite gives up LOGGC_LOCK, check again */ if (tblk->flag & tblkGC_COMMITTED) { if (tblk->flag & tblkGC_ERROR) rc = -EIO; LOGGC_UNLOCK(log); return rc; } /* upcount transaction waiting for completion */ log->gcrtc++; tblk->flag |= tblkGC_READY; __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED), LOGGC_LOCK(log), LOGGC_UNLOCK(log)); /* removed from commit queue */ if (tblk->flag & tblkGC_ERROR) rc = -EIO; LOGGC_UNLOCK(log); return rc; } /* * NAME: lmGCwrite() * * FUNCTION: group commit write * initiate write of log page, building a group of all transactions * with commit records on that page. * * RETURN: None * * NOTE: * LOGGC_LOCK must be held by caller. * N.B. LOG_LOCK is NOT held during lmGroupCommit(). */ static void lmGCwrite(struct jfs_log * log, int cant_write) { struct lbuf *bp; struct logpage *lp; int gcpn; /* group commit page number */ struct tblock *tblk; struct tblock *xtblk = NULL; /* * build the commit group of a log page * * scan commit queue and make a commit group of all * transactions with COMMIT records on the same log page. */ /* get the head tblk on the commit queue */ gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn; list_for_each_entry(tblk, &log->cqueue, cqueue) { if (tblk->pn != gcpn) break; xtblk = tblk; /* state transition: (QUEUE, READY) -> COMMIT */ tblk->flag |= tblkGC_COMMIT; } tblk = xtblk; /* last tblk of the page */ /* * pageout to commit transactions on the log page. */ bp = (struct lbuf *) tblk->bp; lp = (struct logpage *) bp->l_ldata; /* is page already full ? */ if (tblk->flag & tblkGC_EOP) { /* mark page to free at end of group commit of the page */ tblk->flag &= ~tblkGC_EOP; tblk->flag |= tblkGC_FREE; bp->l_ceor = bp->l_eor; lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC, cant_write); INCREMENT(lmStat.full_page); } /* page is not yet full */ else { bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */ lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write); INCREMENT(lmStat.partial_page); } } /* * NAME: lmPostGC() * * FUNCTION: group commit post-processing * Processes transactions after their commit records have been written * to disk, redriving log I/O if necessary. * * RETURN: None * * NOTE: * This routine is called a interrupt time by lbmIODone */ static void lmPostGC(struct lbuf * bp) { unsigned long flags; struct jfs_log *log = bp->l_log; struct logpage *lp; struct tblock *tblk, *temp; //LOGGC_LOCK(log); spin_lock_irqsave(&log->gclock, flags); /* * current pageout of group commit completed. * * remove/wakeup transactions from commit queue who were * group committed with the current log page */ list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) { if (!(tblk->flag & tblkGC_COMMIT)) break; /* if transaction was marked GC_COMMIT then * it has been shipped in the current pageout * and made it to disk - it is committed. */ if (bp->l_flag & lbmERROR) tblk->flag |= tblkGC_ERROR; /* remove it from the commit queue */ list_del(&tblk->cqueue); tblk->flag &= ~tblkGC_QUEUE; if (tblk == log->flush_tblk) { /* we can stop flushing the log now */ clear_bit(log_FLUSH, &log->flag); log->flush_tblk = NULL; } jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk, tblk->flag); if (!(tblk->xflag & COMMIT_FORCE)) /* * Hand tblk over to lazy commit thread */ txLazyUnlock(tblk); else { /* state transition: COMMIT -> COMMITTED */ tblk->flag |= tblkGC_COMMITTED; if (tblk->flag & tblkGC_READY) log->gcrtc--; LOGGC_WAKEUP(tblk); } /* was page full before pageout ? * (and this is the last tblk bound with the page) */ if (tblk->flag & tblkGC_FREE) lbmFree(bp); /* did page become full after pageout ? * (and this is the last tblk bound with the page) */ else if (tblk->flag & tblkGC_EOP) { /* finalize the page */ lp = (struct logpage *) bp->l_ldata; bp->l_ceor = bp->l_eor; lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); jfs_info("lmPostGC: calling lbmWrite"); lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 1); } } /* are there any transactions who have entered lnGroupCommit() * (whose COMMITs are after that of the last log page written. * They are waiting for new group commit (above at (SLEEP 1)) * or lazy transactions are on a full (queued) log page, * select the latest ready transaction as new group leader and * wake her up to lead her group. */ if ((!list_empty(&log->cqueue)) && ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) || test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low)) /* * Call lmGCwrite with new group leader */ lmGCwrite(log, 1); /* no transaction are ready yet (transactions are only just * queued (GC_QUEUE) and not entered for group commit yet). * the first transaction entering group commit * will elect herself as new group leader. */ else log->cflag &= ~logGC_PAGEOUT; //LOGGC_UNLOCK(log); spin_unlock_irqrestore(&log->gclock, flags); return; } /* * NAME: lmLogSync() * * FUNCTION: write log SYNCPT record for specified log * if new sync address is available * (normally the case if sync() is executed by back-ground * process). * calculate new value of i_nextsync which determines when * this code is called again. * * PARAMETERS: log - log structure * hard_sync - 1 to force all metadata to be written * * RETURN: 0 * * serialization: LOG_LOCK() held on entry/exit */ static int lmLogSync(struct jfs_log * log, int hard_sync) { int logsize; int written; /* written since last syncpt */ int free; /* free space left available */ int delta; /* additional delta to write normally */ int more; /* additional write granted */ struct lrd lrd; int lsn; struct logsyncblk *lp; unsigned long flags; /* push dirty metapages out to disk */ if (hard_sync) write_special_inodes(log, filemap_fdatawrite); else write_special_inodes(log, filemap_flush); /* * forward syncpt */ /* if last sync is same as last syncpt, * invoke sync point forward processing to update sync. */ if (log->sync == log->syncpt) { LOGSYNC_LOCK(log, flags); if (list_empty(&log->synclist)) log->sync = log->lsn; else { lp = list_entry(log->synclist.next, struct logsyncblk, synclist); log->sync = lp->lsn; } LOGSYNC_UNLOCK(log, flags); } /* if sync is different from last syncpt, * write a SYNCPT record with syncpt = sync. * reset syncpt = sync */ if (log->sync != log->syncpt) { lrd.logtid = 0; lrd.backchain = 0; lrd.type = cpu_to_le16(LOG_SYNCPT); lrd.length = 0; lrd.log.syncpt.sync = cpu_to_le32(log->sync); lsn = lmWriteRecord(log, NULL, &lrd, NULL); log->syncpt = log->sync; } else lsn = log->lsn; /* * setup next syncpt trigger (SWAG) */ logsize = log->logsize; logdiff(written, lsn, log); free = logsize - written; delta = LOGSYNC_DELTA(logsize); more = min(free / 2, delta); if (more < 2 * LOGPSIZE) { jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); /* * log wrapping * * option 1 - panic ? No.! * option 2 - shutdown file systems * associated with log ? * option 3 - extend log ? * option 4 - second chance * * mark log wrapped, and continue. * when all active transactions are completed, * mark log valid for recovery. * if crashed during invalid state, log state * implies invalid log, forcing fsck(). */ /* mark log state log wrap in log superblock */ /* log->state = LOGWRAP; */ /* reset sync point computation */ log->syncpt = log->sync = lsn; log->nextsync = delta; } else /* next syncpt trigger = written + more */ log->nextsync = written + more; /* if number of bytes written from last sync point is more * than 1/4 of the log size, stop new transactions from * starting until all current transactions are completed * by setting syncbarrier flag. */ if (!test_bit(log_SYNCBARRIER, &log->flag) && (written > LOGSYNC_BARRIER(logsize)) && log->active) { set_bit(log_SYNCBARRIER, &log->flag); jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn, log->syncpt); /* * We may have to initiate group commit */ jfs_flush_journal(log, 0); } return lsn; } /* * NAME: jfs_syncpt * * FUNCTION: write log SYNCPT record for specified log * * PARAMETERS: log - log structure * hard_sync - set to 1 to force metadata to be written */ void jfs_syncpt(struct jfs_log *log, int hard_sync) { LOG_LOCK(log); if (!test_bit(log_QUIESCE, &log->flag)) lmLogSync(log, hard_sync); LOG_UNLOCK(log); } /* * NAME: lmLogOpen() * * FUNCTION: open the log on first open; * insert filesystem in the active list of the log. * * PARAMETER: ipmnt - file system mount inode * iplog - log inode (out) * * RETURN: * * serialization: */ int lmLogOpen(struct super_block *sb) { int rc; struct file *bdev_file; struct jfs_log *log; struct jfs_sb_info *sbi = JFS_SBI(sb); if (sbi->flag & JFS_NOINTEGRITY) return open_dummy_log(sb); if (sbi->mntflag & JFS_INLINELOG) return open_inline_log(sb); mutex_lock(&jfs_log_mutex); list_for_each_entry(log, &jfs_external_logs, journal_list) { if (file_bdev(log->bdev_file)->bd_dev == sbi->logdev) { if (!uuid_equal(&log->uuid, &sbi->loguuid)) { jfs_warn("wrong uuid on JFS journal"); mutex_unlock(&jfs_log_mutex); return -EINVAL; } /* * add file system to log active file system list */ if ((rc = lmLogFileSystem(log, sbi, 1))) { mutex_unlock(&jfs_log_mutex); return rc; } goto journal_found; } } if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) { mutex_unlock(&jfs_log_mutex); return -ENOMEM; } INIT_LIST_HEAD(&log->sb_list); init_waitqueue_head(&log->syncwait); /* * external log as separate logical volume * * file systems to log may have n-to-1 relationship; */ bdev_file = bdev_file_open_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL); if (IS_ERR(bdev_file)) { rc = PTR_ERR(bdev_file); goto free; } log->bdev_file = bdev_file; uuid_copy(&log->uuid, &sbi->loguuid); /* * initialize log: */ if ((rc = lmLogInit(log))) goto close; list_add(&log->journal_list, &jfs_external_logs); /* * add file system to log active file system list */ if ((rc = lmLogFileSystem(log, sbi, 1))) goto shutdown; journal_found: LOG_LOCK(log); list_add(&sbi->log_list, &log->sb_list); sbi->log = log; LOG_UNLOCK(log); mutex_unlock(&jfs_log_mutex); return 0; /* * unwind on error */ shutdown: /* unwind lbmLogInit() */ list_del(&log->journal_list); lbmLogShutdown(log); close: /* close external log device */ bdev_fput(bdev_file); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); kfree(log); jfs_warn("lmLogOpen: exit(%d)", rc); return rc; } static int open_inline_log(struct super_block *sb) { struct jfs_log *log; int rc; if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) return -ENOMEM; INIT_LIST_HEAD(&log->sb_list); init_waitqueue_head(&log->syncwait); set_bit(log_INLINELOG, &log->flag); log->bdev_file = sb->s_bdev_file; log->base = addressPXD(&JFS_SBI(sb)->logpxd); log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> (L2LOGPSIZE - sb->s_blocksize_bits); log->l2bsize = sb->s_blocksize_bits; ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits); /* * initialize log. */ if ((rc = lmLogInit(log))) { kfree(log); jfs_warn("lmLogOpen: exit(%d)", rc); return rc; } list_add(&JFS_SBI(sb)->log_list, &log->sb_list); JFS_SBI(sb)->log = log; return rc; } static int open_dummy_log(struct super_block *sb) { int rc; mutex_lock(&jfs_log_mutex); if (!dummy_log) { dummy_log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL); if (!dummy_log) { mutex_unlock(&jfs_log_mutex); return -ENOMEM; } INIT_LIST_HEAD(&dummy_log->sb_list); init_waitqueue_head(&dummy_log->syncwait); dummy_log->no_integrity = 1; /* Make up some stuff */ dummy_log->base = 0; dummy_log->size = 1024; rc = lmLogInit(dummy_log); if (rc) { kfree(dummy_log); dummy_log = NULL; mutex_unlock(&jfs_log_mutex); return rc; } } LOG_LOCK(dummy_log); list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list); JFS_SBI(sb)->log = dummy_log; LOG_UNLOCK(dummy_log); mutex_unlock(&jfs_log_mutex); return 0; } /* * NAME: lmLogInit() * * FUNCTION: log initialization at first log open. * * logredo() (or logformat()) should have been run previously. * initialize the log from log superblock. * set the log state in the superblock to LOGMOUNT and * write SYNCPT log record. * * PARAMETER: log - log structure * * RETURN: 0 - if ok * -EINVAL - bad log magic number or superblock dirty * error returned from logwait() * * serialization: single first open thread */ int lmLogInit(struct jfs_log * log) { int rc = 0; struct lrd lrd; struct logsuper *logsuper; struct lbuf *bpsuper; struct lbuf *bp; struct logpage *lp; int lsn = 0; jfs_info("lmLogInit: log:0x%p", log); /* initialize the group commit serialization lock */ LOGGC_LOCK_INIT(log); /* allocate/initialize the log write serialization lock */ LOG_LOCK_INIT(log); LOGSYNC_LOCK_INIT(log); INIT_LIST_HEAD(&log->synclist); INIT_LIST_HEAD(&log->cqueue); log->flush_tblk = NULL; log->count = 0; /* * initialize log i/o */ if ((rc = lbmLogInit(log))) return rc; if (!test_bit(log_INLINELOG, &log->flag)) log->l2bsize = L2LOGPSIZE; /* check for disabled journaling to disk */ if (log->no_integrity) { /* * Journal pages will still be filled. When the time comes * to actually do the I/O, the write is not done, and the * endio routine is called directly. */ bp = lbmAllocate(log , 0); log->bp = bp; bp->l_pn = bp->l_eor = 0; } else { /* * validate log superblock */ if ((rc = lbmRead(log, 1, &bpsuper))) goto errout10; logsuper = (struct logsuper *) bpsuper->l_ldata; if (logsuper->magic != cpu_to_le32(LOGMAGIC)) { jfs_warn("*** Log Format Error ! ***"); rc = -EINVAL; goto errout20; } /* logredo() should have been run successfully. */ if (logsuper->state != cpu_to_le32(LOGREDONE)) { jfs_warn("*** Log Is Dirty ! ***"); rc = -EINVAL; goto errout20; } /* initialize log from log superblock */ if (test_bit(log_INLINELOG,&log->flag)) { if (log->size != le32_to_cpu(logsuper->size)) { rc = -EINVAL; goto errout20; } jfs_info("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x", log, (unsigned long long)log->base, log->size); } else { if (!uuid_equal(&logsuper->uuid, &log->uuid)) { jfs_warn("wrong uuid on JFS log device"); rc = -EINVAL; goto errout20; } log->size = le32_to_cpu(logsuper->size); log->l2bsize = le32_to_cpu(logsuper->l2bsize); jfs_info("lmLogInit: external log:0x%p base:0x%Lx size:0x%x", log, (unsigned long long)log->base, log->size); } log->page = le32_to_cpu(logsuper->end) / LOGPSIZE; log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page); /* * initialize for log append write mode */ /* establish current/end-of-log page/buffer */ if ((rc = lbmRead(log, log->page, &bp))) goto errout20; lp = (struct logpage *) bp->l_ldata; jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d", le32_to_cpu(logsuper->end), log->page, log->eor, le16_to_cpu(lp->h.eor)); log->bp = bp; bp->l_pn = log->page; bp->l_eor = log->eor; /* if current page is full, move on to next page */ if (log->eor >= LOGPSIZE - LOGPTLRSIZE) lmNextPage(log); /* * initialize log syncpoint */ /* * write the first SYNCPT record with syncpoint = 0 * (i.e., log redo up to HERE !); * remove current page from lbm write queue at end of pageout * (to write log superblock update), but do not release to * freelist; */ lrd.logtid = 0; lrd.backchain = 0; lrd.type = cpu_to_le16(LOG_SYNCPT); lrd.length = 0; lrd.log.syncpt.sync = 0; lsn = lmWriteRecord(log, NULL, &lrd, NULL); bp = log->bp; bp->l_ceor = bp->l_eor; lp = (struct logpage *) bp->l_ldata; lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0); if ((rc = lbmIOWait(bp, 0))) goto errout30; /* * update/write superblock */ logsuper->state = cpu_to_le32(LOGMOUNT); log->serial = le32_to_cpu(logsuper->serial) + 1; logsuper->serial = cpu_to_le32(log->serial); lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); if ((rc = lbmIOWait(bpsuper, lbmFREE))) goto errout30; } /* initialize logsync parameters */ log->logsize = (log->size - 2) << L2LOGPSIZE; log->lsn = lsn; log->syncpt = lsn; log->sync = log->syncpt; log->nextsync = LOGSYNC_DELTA(log->logsize); jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x", log->lsn, log->syncpt, log->sync); /* * initialize for lazy/group commit */ log->clsn = lsn; return 0; /* * unwind on error */ errout30: /* release log page */ log->wqueue = NULL; bp->l_wqnext = NULL; lbmFree(bp); errout20: /* release log superblock */ lbmFree(bpsuper); errout10: /* unwind lbmLogInit() */ lbmLogShutdown(log); jfs_warn("lmLogInit: exit(%d)", rc); return rc; } /* * NAME: lmLogClose() * * FUNCTION: remove file system <ipmnt> from active list of log <iplog> * and close it on last close. * * PARAMETER: sb - superblock * * RETURN: errors from subroutines * * serialization: */ int lmLogClose(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; struct file *bdev_file; int rc = 0; jfs_info("lmLogClose: log:0x%p", log); mutex_lock(&jfs_log_mutex); LOG_LOCK(log); list_del(&sbi->log_list); LOG_UNLOCK(log); sbi->log = NULL; /* * We need to make sure all of the "written" metapages * actually make it to disk */ sync_blockdev(sb->s_bdev); if (test_bit(log_INLINELOG, &log->flag)) { /* * in-line log in host file system */ rc = lmLogShutdown(log); kfree(log); goto out; } if (!log->no_integrity) lmLogFileSystem(log, sbi, 0); if (!list_empty(&log->sb_list)) goto out; /* * TODO: ensure that the dummy_log is in a state to allow * lbmLogShutdown to deallocate all the buffers and call * kfree against dummy_log. For now, leave dummy_log & its * buffers in memory, and resuse if another no-integrity mount * is requested. */ if (log->no_integrity) goto out; /* * external log as separate logical volume */ list_del(&log->journal_list); bdev_file = log->bdev_file; rc = lmLogShutdown(log); bdev_fput(bdev_file); kfree(log); out: mutex_unlock(&jfs_log_mutex); jfs_info("lmLogClose: exit(%d)", rc); return rc; } /* * NAME: jfs_flush_journal() * * FUNCTION: initiate write of any outstanding transactions to the journal * and optionally wait until they are all written to disk * * wait == 0 flush until latest txn is committed, don't wait * wait == 1 flush until latest txn is committed, wait * wait > 1 flush until all txn's are complete, wait */ void jfs_flush_journal(struct jfs_log *log, int wait) { int i; struct tblock *target = NULL; /* jfs_write_inode may call us during read-only mount */ if (!log) return; jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait); LOGGC_LOCK(log); if (!list_empty(&log->cqueue)) { /* * This ensures that we will keep writing to the journal as long * as there are unwritten commit records */ target = list_entry(log->cqueue.prev, struct tblock, cqueue); if (test_bit(log_FLUSH, &log->flag)) { /* * We're already flushing. * if flush_tblk is NULL, we are flushing everything, * so leave it that way. Otherwise, update it to the * latest transaction */ if (log->flush_tblk) log->flush_tblk = target; } else { /* Only flush until latest transaction is committed */ log->flush_tblk = target; set_bit(log_FLUSH, &log->flag); /* * Initiate I/O on outstanding transactions */ if (!(log->cflag & logGC_PAGEOUT)) { log->cflag |= logGC_PAGEOUT; lmGCwrite(log, 0); } } } if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) { /* Flush until all activity complete */ set_bit(log_FLUSH, &log->flag); log->flush_tblk = NULL; } if (wait && target && !(target->flag & tblkGC_COMMITTED)) { DECLARE_WAITQUEUE(__wait, current); add_wait_queue(&target->gcwait, &__wait); set_current_state(TASK_UNINTERRUPTIBLE); LOGGC_UNLOCK(log); schedule(); LOGGC_LOCK(log); remove_wait_queue(&target->gcwait, &__wait); } LOGGC_UNLOCK(log); if (wait < 2) return; write_special_inodes(log, filemap_fdatawrite); /* * If there was recent activity, we may need to wait * for the lazycommit thread to catch up */ if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) { for (i = 0; i < 200; i++) { /* Too much? */ msleep(250); write_special_inodes(log, filemap_fdatawrite); if (list_empty(&log->cqueue) && list_empty(&log->synclist)) break; } } assert(list_empty(&log->cqueue)); #ifdef CONFIG_JFS_DEBUG if (!list_empty(&log->synclist)) { struct logsyncblk *lp; printk(KERN_ERR "jfs_flush_journal: synclist not empty\n"); list_for_each_entry(lp, &log->synclist, synclist) { if (lp->xflag & COMMIT_PAGE) { struct metapage *mp = (struct metapage *)lp; print_hex_dump(KERN_ERR, "metapage: ", DUMP_PREFIX_ADDRESS, 16, 4, mp, sizeof(struct metapage), 0); print_hex_dump(KERN_ERR, "page: ", DUMP_PREFIX_ADDRESS, 16, sizeof(long), mp->folio, sizeof(struct page), 0); } else print_hex_dump(KERN_ERR, "tblock:", DUMP_PREFIX_ADDRESS, 16, 4, lp, sizeof(struct tblock), 0); } } #else WARN_ON(!list_empty(&log->synclist)); #endif clear_bit(log_FLUSH, &log->flag); } /* * NAME: lmLogShutdown() * * FUNCTION: log shutdown at last LogClose(). * * write log syncpt record. * update super block to set redone flag to 0. * * PARAMETER: log - log inode * * RETURN: 0 - success * * serialization: single last close thread */ int lmLogShutdown(struct jfs_log * log) { int rc; struct lrd lrd; int lsn; struct logsuper *logsuper; struct lbuf *bpsuper; struct lbuf *bp; struct logpage *lp; jfs_info("lmLogShutdown: log:0x%p", log); jfs_flush_journal(log, 2); /* * write the last SYNCPT record with syncpoint = 0 * (i.e., log redo up to HERE !) */ lrd.logtid = 0; lrd.backchain = 0; lrd.type = cpu_to_le16(LOG_SYNCPT); lrd.length = 0; lrd.log.syncpt.sync = 0; lsn = lmWriteRecord(log, NULL, &lrd, NULL); bp = log->bp; lp = (struct logpage *) bp->l_ldata; lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0); lbmIOWait(log->bp, lbmFREE); log->bp = NULL; /* * synchronous update log superblock * mark log state as shutdown cleanly * (i.e., Log does not need to be replayed). */ if ((rc = lbmRead(log, 1, &bpsuper))) goto out; logsuper = (struct logsuper *) bpsuper->l_ldata; logsuper->state = cpu_to_le32(LOGREDONE); logsuper->end = cpu_to_le32(lsn); lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); rc = lbmIOWait(bpsuper, lbmFREE); jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d", lsn, log->page, log->eor); out: /* * shutdown per log i/o */ lbmLogShutdown(log); if (rc) { jfs_warn("lmLogShutdown: exit(%d)", rc); } return rc; } /* * NAME: lmLogFileSystem() * * FUNCTION: insert (<activate> = true)/remove (<activate> = false) * file system into/from log active file system list. * * PARAMETE: log - pointer to logs inode. * fsdev - kdev_t of filesystem. * serial - pointer to returned log serial number * activate - insert/remove device from active list. * * RETURN: 0 - success * errors returned by vms_iowait(). */ static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, int activate) { int rc = 0; int i; struct logsuper *logsuper; struct lbuf *bpsuper; uuid_t *uuid = &sbi->uuid; /* * insert/remove file system device to log active file system list. */ if ((rc = lbmRead(log, 1, &bpsuper))) return rc; logsuper = (struct logsuper *) bpsuper->l_ldata; if (activate) { for (i = 0; i < MAX_ACTIVE; i++) if (uuid_is_null(&logsuper->active[i].uuid)) { uuid_copy(&logsuper->active[i].uuid, uuid); sbi->aggregate = i; break; } if (i == MAX_ACTIVE) { jfs_warn("Too many file systems sharing journal!"); lbmFree(bpsuper); return -EMFILE; /* Is there a better rc? */ } } else { for (i = 0; i < MAX_ACTIVE; i++) if (uuid_equal(&logsuper->active[i].uuid, uuid)) { uuid_copy(&logsuper->active[i].uuid, &uuid_null); break; } if (i == MAX_ACTIVE) { jfs_warn("Somebody stomped on the journal!"); lbmFree(bpsuper); return -EIO; } } /* * synchronous write log superblock: * * write sidestream bypassing write queue: * at file system mount, log super block is updated for * activation of the file system before any log record * (MOUNT record) of the file system, and at file system * unmount, all meta data for the file system has been * flushed before log super block is updated for deactivation * of the file system. */ lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); rc = lbmIOWait(bpsuper, lbmFREE); return rc; } /* * log buffer manager (lbm) * ------------------------ * * special purpose buffer manager supporting log i/o requirements. * * per log write queue: * log pageout occurs in serial order by fifo write queue and * restricting to a single i/o in pregress at any one time. * a circular singly-linked list * (log->wrqueue points to the tail, and buffers are linked via * bp->wrqueue field), and * maintains log page in pageout ot waiting for pageout in serial pageout. */ /* * lbmLogInit() * * initialize per log I/O setup at lmLogInit() */ static int lbmLogInit(struct jfs_log * log) { /* log inode */ int i; struct lbuf *lbuf; jfs_info("lbmLogInit: log:0x%p", log); /* initialize current buffer cursor */ log->bp = NULL; /* initialize log device write queue */ log->wqueue = NULL; /* * Each log has its own buffer pages allocated to it. These are * not managed by the page cache. This ensures that a transaction * writing to the log does not block trying to allocate a page from * the page cache (for the log). This would be bad, since page * allocation waits on the kswapd thread that may be committing inodes * which would cause log activity. Was that clear? I'm trying to * avoid deadlock here. */ init_waitqueue_head(&log->free_wait); log->lbuf_free = NULL; for (i = 0; i < LOGPAGES;) { char *buffer; uint offset; struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) goto error; buffer = page_address(page); for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) { lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL); if (lbuf == NULL) { if (offset == 0) __free_page(page); goto error; } if (offset) /* we already have one reference */ get_page(page); lbuf->l_offset = offset; lbuf->l_ldata = buffer + offset; lbuf->l_page = page; lbuf->l_log = log; init_waitqueue_head(&lbuf->l_ioevent); lbuf->l_freelist = log->lbuf_free; log->lbuf_free = lbuf; i++; } } return (0); error: lbmLogShutdown(log); return -ENOMEM; } /* * lbmLogShutdown() * * finalize per log I/O setup at lmLogShutdown() */ static void lbmLogShutdown(struct jfs_log * log) { struct lbuf *lbuf; jfs_info("lbmLogShutdown: log:0x%p", log); lbuf = log->lbuf_free; while (lbuf) { struct lbuf *next = lbuf->l_freelist; __free_page(lbuf->l_page); kfree(lbuf); lbuf = next; } } /* * lbmAllocate() * * allocate an empty log buffer */ static struct lbuf *lbmAllocate(struct jfs_log * log, int pn) { struct lbuf *bp; unsigned long flags; /* * recycle from log buffer freelist if any */ LCACHE_LOCK(flags); LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags); log->lbuf_free = bp->l_freelist; LCACHE_UNLOCK(flags); bp->l_flag = 0; bp->l_wqnext = NULL; bp->l_freelist = NULL; bp->l_pn = pn; bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize)); bp->l_ceor = 0; return bp; } /* * lbmFree() * * release a log buffer to freelist */ static void lbmFree(struct lbuf * bp) { unsigned long flags; LCACHE_LOCK(flags); lbmfree(bp); LCACHE_UNLOCK(flags); } static void lbmfree(struct lbuf * bp) { struct jfs_log *log = bp->l_log; assert(bp->l_wqnext == NULL); /* * return the buffer to head of freelist */ bp->l_freelist = log->lbuf_free; log->lbuf_free = bp; wake_up(&log->free_wait); return; } /* * NAME: lbmRedrive * * FUNCTION: add a log buffer to the log redrive list * * PARAMETER: * bp - log buffer * * NOTES: * Takes log_redrive_lock. */ static inline void lbmRedrive(struct lbuf *bp) { unsigned long flags; spin_lock_irqsave(&log_redrive_lock, flags); bp->l_redrive_next = log_redrive_list; log_redrive_list = bp; spin_unlock_irqrestore(&log_redrive_lock, flags); wake_up_process(jfsIOthread); } /* * lbmRead() */ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) { struct bio *bio; struct lbuf *bp; /* * allocate a log buffer */ *bpp = bp = lbmAllocate(log, pn); jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn); bp->l_flag |= lbmREAD; bio = bio_alloc(file_bdev(log->bdev_file), 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; lbmIODone(bio); } else { submit_bio(bio); } wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); return 0; } /* * lbmWrite() * * buffer at head of pageout queue stays after completion of * partial-page pageout and redriven by explicit initiation of * pageout by caller until full-page pageout is completed and * released. * * device driver i/o done redrives pageout of new buffer at * head of pageout queue when current buffer at head of pageout * queue is released at the completion of its full-page pageout. * * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit(). * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone() */ static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block) { struct lbuf *tail; unsigned long flags; jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn); /* map the logical block address to physical block address */ bp->l_blkno = log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); LCACHE_LOCK(flags); /* disable+lock */ /* * initialize buffer for device driver */ bp->l_flag = flag; /* * insert bp at tail of write queue associated with log * * (request is either for bp already/currently at head of queue * or new bp to be inserted at tail) */ tail = log->wqueue; /* is buffer not already on write queue ? */ if (bp->l_wqnext == NULL) { /* insert at tail of wqueue */ if (tail == NULL) { log->wqueue = bp; bp->l_wqnext = bp; } else { log->wqueue = bp; bp->l_wqnext = tail->l_wqnext; tail->l_wqnext = bp; } tail = bp; } /* is buffer at head of wqueue and for write ? */ if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) { LCACHE_UNLOCK(flags); /* unlock+enable */ return; } LCACHE_UNLOCK(flags); /* unlock+enable */ if (cant_block) lbmRedrive(bp); else if (flag & lbmSYNC) lbmStartIO(bp); else { LOGGC_UNLOCK(log); lbmStartIO(bp); LOGGC_LOCK(log); } } /* * lbmDirectWrite() * * initiate pageout bypassing write queue for sidestream * (e.g., log superblock) write; */ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) { jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn); /* * initialize buffer for device driver */ bp->l_flag = flag | lbmDIRECT; /* map the logical block address to physical block address */ bp->l_blkno = log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); /* * initiate pageout of the page */ lbmStartIO(bp); } /* * NAME: lbmStartIO() * * FUNCTION: Interface to DD strategy routine * * RETURN: none * * serialization: LCACHE_LOCK() is NOT held during log i/o; */ static void lbmStartIO(struct lbuf * bp) { struct bio *bio; struct jfs_log *log = bp->l_log; struct block_device *bdev = NULL; jfs_info("lbmStartIO"); if (!log->no_integrity) bdev = file_bdev(log->bdev_file); bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; /* check if journaling to disk has been disabled */ if (log->no_integrity) { bio->bi_iter.bi_size = 0; lbmIODone(bio); } else { submit_bio(bio); INCREMENT(lmStat.submitted); } } /* * lbmIOWait() */ static int lbmIOWait(struct lbuf * bp, int flag) { unsigned long flags; int rc = 0; jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); LCACHE_LOCK(flags); /* disable+lock */ LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags); rc = (bp->l_flag & lbmERROR) ? -EIO : 0; if (flag & lbmFREE) lbmfree(bp); LCACHE_UNLOCK(flags); /* unlock+enable */ jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); return rc; } /* * lbmIODone() * * executed at INTIODONE level */ static void lbmIODone(struct bio *bio) { struct lbuf *bp = bio->bi_private; struct lbuf *nextbp, *tail; struct jfs_log *log; unsigned long flags; /* * get back jfs buffer bound to the i/o buffer */ jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag); LCACHE_LOCK(flags); /* disable+lock */ bp->l_flag |= lbmDONE; if (bio->bi_status) { bp->l_flag |= lbmERROR; jfs_err("lbmIODone: I/O error in JFS log"); } bio_put(bio); /* * pagein completion */ if (bp->l_flag & lbmREAD) { bp->l_flag &= ~lbmREAD; LCACHE_UNLOCK(flags); /* unlock+enable */ /* wakeup I/O initiator */ LCACHE_WAKEUP(&bp->l_ioevent); return; } /* * pageout completion * * the bp at the head of write queue has completed pageout. * * if single-commit/full-page pageout, remove the current buffer * from head of pageout queue, and redrive pageout with * the new buffer at head of pageout queue; * otherwise, the partial-page pageout buffer stays at * the head of pageout queue to be redriven for pageout * by lmGroupCommit() until full-page pageout is completed. */ bp->l_flag &= ~lbmWRITE; INCREMENT(lmStat.pagedone); /* update committed lsn */ log = bp->l_log; log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor; if (bp->l_flag & lbmDIRECT) { LCACHE_WAKEUP(&bp->l_ioevent); LCACHE_UNLOCK(flags); return; } tail = log->wqueue; /* single element queue */ if (bp == tail) { /* remove head buffer of full-page pageout * from log device write queue */ if (bp->l_flag & lbmRELEASE) { log->wqueue = NULL; bp->l_wqnext = NULL; } } /* multi element queue */ else { /* remove head buffer of full-page pageout * from log device write queue */ if (bp->l_flag & lbmRELEASE) { nextbp = tail->l_wqnext = bp->l_wqnext; bp->l_wqnext = NULL; /* * redrive pageout of next page at head of write queue: * redrive next page without any bound tblk * (i.e., page w/o any COMMIT records), or * first page of new group commit which has been * queued after current page (subsequent pageout * is performed synchronously, except page without * any COMMITs) by lmGroupCommit() as indicated * by lbmWRITE flag; */ if (nextbp->l_flag & lbmWRITE) { /* * We can't do the I/O at interrupt time. * The jfsIO thread can do it */ lbmRedrive(nextbp); } } } /* * synchronous pageout: * * buffer has not necessarily been removed from write queue * (e.g., synchronous write of partial-page with COMMIT): * leave buffer for i/o initiator to dispose */ if (bp->l_flag & lbmSYNC) { LCACHE_UNLOCK(flags); /* unlock+enable */ /* wakeup I/O initiator */ LCACHE_WAKEUP(&bp->l_ioevent); } /* * Group Commit pageout: */ else if (bp->l_flag & lbmGC) { LCACHE_UNLOCK(flags); lmPostGC(bp); } /* * asynchronous pageout: * * buffer must have been removed from write queue: * insert buffer at head of freelist where it can be recycled */ else { assert(bp->l_flag & lbmRELEASE); assert(bp->l_flag & lbmFREE); lbmfree(bp); LCACHE_UNLOCK(flags); /* unlock+enable */ } } int jfsIOWait(void *arg) { struct lbuf *bp; do { spin_lock_irq(&log_redrive_lock); while ((bp = log_redrive_list)) { log_redrive_list = bp->l_redrive_next; bp->l_redrive_next = NULL; spin_unlock_irq(&log_redrive_lock); lbmStartIO(bp); spin_lock_irq(&log_redrive_lock); } if (freezing(current)) { spin_unlock_irq(&log_redrive_lock); try_to_freeze(); } else { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&log_redrive_lock); schedule(); } } while (!kthread_should_stop()); jfs_info("jfsIOWait being killed!"); return 0; } /* * NAME: lmLogFormat()/jfs_logform() * * FUNCTION: format file system log * * PARAMETERS: * log - volume log * logAddress - start address of log space in FS block * logSize - length of log space in FS block; * * RETURN: 0 - success * -EIO - i/o error * * XXX: We're synchronously writing one page at a time. This needs to * be improved by writing multiple pages at once. */ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) { int rc = -EIO; struct jfs_sb_info *sbi; struct logsuper *logsuper; struct logpage *lp; int lspn; /* log sequence page number */ struct lrd *lrd_ptr; int npages = 0; struct lbuf *bp; jfs_info("lmLogFormat: logAddress:%Ld logSize:%d", (long long)logAddress, logSize); sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list); /* allocate a log buffer */ bp = lbmAllocate(log, 1); npages = logSize >> sbi->l2nbperpage; /* * log space: * * page 0 - reserved; * page 1 - log superblock; * page 2 - log data page: A SYNC log record is written * into this page at logform time; * pages 3-N - log data page: set to empty log data pages; */ /* * init log superblock: log page 1 */ logsuper = (struct logsuper *) bp->l_ldata; logsuper->magic = cpu_to_le32(LOGMAGIC); logsuper->version = cpu_to_le32(LOGVERSION); logsuper->state = cpu_to_le32(LOGREDONE); logsuper->flag = cpu_to_le32(sbi->mntflag); /* ? */ logsuper->size = cpu_to_le32(npages); logsuper->bsize = cpu_to_le32(sbi->bsize); logsuper->l2bsize = cpu_to_le32(sbi->l2bsize); logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE); bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; bp->l_blkno = logAddress + sbi->nbperpage; lbmStartIO(bp); if ((rc = lbmIOWait(bp, 0))) goto exit; /* * init pages 2 to npages-1 as log data pages: * * log page sequence number (lpsn) initialization: * * pn: 0 1 2 3 n-1 * +-----+-----+=====+=====+===.....===+=====+ * lspn: N-1 0 1 N-2 * <--- N page circular file ----> * * the N (= npages-2) data pages of the log is maintained as * a circular file for the log records; * lpsn grows by 1 monotonically as each log page is written * to the circular file of the log; * and setLogpage() will not reset the page number even if * the eor is equal to LOGPHDRSIZE. In order for binary search * still work in find log end process, we have to simulate the * log wrap situation at the log format time. * The 1st log page written will have the highest lpsn. Then * the succeeding log pages will have ascending order of * the lspn starting from 0, ... (N-2) */ lp = (struct logpage *) bp->l_ldata; /* * initialize 1st log page to be written: lpsn = N - 1, * write a SYNCPT log record is written to this page */ lp->h.page = lp->t.page = cpu_to_le32(npages - 3); lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE); lrd_ptr = (struct lrd *) &lp->data; lrd_ptr->logtid = 0; lrd_ptr->backchain = 0; lrd_ptr->type = cpu_to_le16(LOG_SYNCPT); lrd_ptr->length = 0; lrd_ptr->log.syncpt.sync = 0; bp->l_blkno += sbi->nbperpage; bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; lbmStartIO(bp); if ((rc = lbmIOWait(bp, 0))) goto exit; /* * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) */ for (lspn = 0; lspn < npages - 3; lspn++) { lp->h.page = lp->t.page = cpu_to_le32(lspn); lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); bp->l_blkno += sbi->nbperpage; bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; lbmStartIO(bp); if ((rc = lbmIOWait(bp, 0))) goto exit; } rc = 0; exit: /* * finalize log */ /* release the buffer */ lbmFree(bp); return rc; } #ifdef CONFIG_JFS_STATISTICS int jfs_lmstats_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Logmgr stats\n" "================\n" "commits = %d\n" "writes submitted = %d\n" "writes completed = %d\n" "full pages submitted = %d\n" "partial pages submitted = %d\n", lmStat.commit, lmStat.submitted, lmStat.pagedone, lmStat.full_page, lmStat.partial_page); return 0; } #endif /* CONFIG_JFS_STATISTICS */ |
6 17 11 3 23 14 17 6 23 23 18 10 21 20 32 24 24 39 4 8 2 3 52 52 3 23 40 40 26 14 50 50 49 43 40 25 15 13 23 23 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2014 Fraunhofer ITWM * * Written by: * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de> */ #include <linux/ieee802154.h> #include <net/mac802154.h> #include <net/ieee802154_netdev.h> static int ieee802154_hdr_push_addr(u8 *buf, const struct ieee802154_addr *addr, bool omit_pan) { int pos = 0; if (addr->mode == IEEE802154_ADDR_NONE) return 0; if (!omit_pan) { memcpy(buf + pos, &addr->pan_id, 2); pos += 2; } switch (addr->mode) { case IEEE802154_ADDR_SHORT: memcpy(buf + pos, &addr->short_addr, 2); pos += 2; break; case IEEE802154_ADDR_LONG: memcpy(buf + pos, &addr->extended_addr, IEEE802154_ADDR_LEN); pos += IEEE802154_ADDR_LEN; break; default: return -EINVAL; } return pos; } static int ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr) { int pos = 5; memcpy(buf, hdr, 1); memcpy(buf + 1, &hdr->frame_counter, 4); switch (hdr->key_id_mode) { case IEEE802154_SCF_KEY_IMPLICIT: return pos; case IEEE802154_SCF_KEY_INDEX: break; case IEEE802154_SCF_KEY_SHORT_INDEX: memcpy(buf + pos, &hdr->short_src, 4); pos += 4; break; case IEEE802154_SCF_KEY_HW_INDEX: memcpy(buf + pos, &hdr->extended_src, IEEE802154_ADDR_LEN); pos += IEEE802154_ADDR_LEN; break; } buf[pos++] = hdr->key_id; return pos; } int ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr) { u8 buf[IEEE802154_MAX_HEADER_LEN]; int pos = 2; int rc; struct ieee802154_hdr_fc *fc = &hdr->fc; buf[pos++] = hdr->seq; fc->dest_addr_mode = hdr->dest.mode; rc = ieee802154_hdr_push_addr(buf + pos, &hdr->dest, false); if (rc < 0) return -EINVAL; pos += rc; fc->source_addr_mode = hdr->source.mode; if (hdr->source.pan_id == hdr->dest.pan_id && hdr->dest.mode != IEEE802154_ADDR_NONE) fc->intra_pan = true; rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc->intra_pan); if (rc < 0) return -EINVAL; pos += rc; if (fc->security_enabled) { fc->version = 1; rc = ieee802154_hdr_push_sechdr(buf + pos, &hdr->sec); if (rc < 0) return -EINVAL; pos += rc; } memcpy(buf, fc, 2); memcpy(skb_push(skb, pos), buf, pos); return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_push); int ieee802154_mac_cmd_push(struct sk_buff *skb, void *f, const void *pl, unsigned int pl_len) { struct ieee802154_mac_cmd_frame *frame = f; struct ieee802154_mac_cmd_pl *mac_pl = &frame->mac_pl; struct ieee802154_hdr *mhr = &frame->mhr; int ret; skb_reserve(skb, sizeof(*mhr)); ret = ieee802154_hdr_push(skb, mhr); if (ret < 0) return ret; skb_reset_mac_header(skb); skb->mac_len = ret; skb_put_data(skb, mac_pl, sizeof(*mac_pl)); skb_put_data(skb, pl, pl_len); return 0; } EXPORT_SYMBOL_GPL(ieee802154_mac_cmd_push); int ieee802154_beacon_push(struct sk_buff *skb, struct ieee802154_beacon_frame *beacon) { struct ieee802154_beacon_hdr *mac_pl = &beacon->mac_pl; struct ieee802154_hdr *mhr = &beacon->mhr; int ret; skb_reserve(skb, sizeof(*mhr)); ret = ieee802154_hdr_push(skb, mhr); if (ret < 0) return ret; skb_reset_mac_header(skb); skb->mac_len = ret; skb_put_data(skb, mac_pl, sizeof(*mac_pl)); if (mac_pl->pend_short_addr_count || mac_pl->pend_ext_addr_count) return -EOPNOTSUPP; return 0; } EXPORT_SYMBOL_GPL(ieee802154_beacon_push); static int ieee802154_hdr_get_addr(const u8 *buf, int mode, bool omit_pan, struct ieee802154_addr *addr) { int pos = 0; addr->mode = mode; if (mode == IEEE802154_ADDR_NONE) return 0; if (!omit_pan) { memcpy(&addr->pan_id, buf + pos, 2); pos += 2; } if (mode == IEEE802154_ADDR_SHORT) { memcpy(&addr->short_addr, buf + pos, 2); return pos + 2; } else { memcpy(&addr->extended_addr, buf + pos, IEEE802154_ADDR_LEN); return pos + IEEE802154_ADDR_LEN; } } static int ieee802154_hdr_addr_len(int mode, bool omit_pan) { int pan_len = omit_pan ? 0 : 2; switch (mode) { case IEEE802154_ADDR_NONE: return 0; case IEEE802154_ADDR_SHORT: return 2 + pan_len; case IEEE802154_ADDR_LONG: return IEEE802154_ADDR_LEN + pan_len; default: return -EINVAL; } } static int ieee802154_hdr_get_sechdr(const u8 *buf, struct ieee802154_sechdr *hdr) { int pos = 5; memcpy(hdr, buf, 1); memcpy(&hdr->frame_counter, buf + 1, 4); switch (hdr->key_id_mode) { case IEEE802154_SCF_KEY_IMPLICIT: return pos; case IEEE802154_SCF_KEY_INDEX: break; case IEEE802154_SCF_KEY_SHORT_INDEX: memcpy(&hdr->short_src, buf + pos, 4); pos += 4; break; case IEEE802154_SCF_KEY_HW_INDEX: memcpy(&hdr->extended_src, buf + pos, IEEE802154_ADDR_LEN); pos += IEEE802154_ADDR_LEN; break; } hdr->key_id = buf[pos++]; return pos; } static int ieee802154_sechdr_lengths[4] = { [IEEE802154_SCF_KEY_IMPLICIT] = 5, [IEEE802154_SCF_KEY_INDEX] = 6, [IEEE802154_SCF_KEY_SHORT_INDEX] = 10, [IEEE802154_SCF_KEY_HW_INDEX] = 14, }; static int ieee802154_hdr_sechdr_len(u8 sc) { return ieee802154_sechdr_lengths[IEEE802154_SCF_KEY_ID_MODE(sc)]; } static int ieee802154_hdr_minlen(const struct ieee802154_hdr *hdr) { int dlen, slen; dlen = ieee802154_hdr_addr_len(hdr->fc.dest_addr_mode, false); slen = ieee802154_hdr_addr_len(hdr->fc.source_addr_mode, hdr->fc.intra_pan); if (slen < 0 || dlen < 0) return -EINVAL; return 3 + dlen + slen + hdr->fc.security_enabled; } static int ieee802154_hdr_get_addrs(const u8 *buf, struct ieee802154_hdr *hdr) { int pos = 0; pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.dest_addr_mode, false, &hdr->dest); pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.source_addr_mode, hdr->fc.intra_pan, &hdr->source); if (hdr->fc.intra_pan) hdr->source.pan_id = hdr->dest.pan_id; return pos; } int ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr) { int pos = 3, rc; if (!pskb_may_pull(skb, 3)) return -EINVAL; memcpy(hdr, skb->data, 3); rc = ieee802154_hdr_minlen(hdr); if (rc < 0 || !pskb_may_pull(skb, rc)) return -EINVAL; pos += ieee802154_hdr_get_addrs(skb->data + pos, hdr); if (hdr->fc.security_enabled) { int want = pos + ieee802154_hdr_sechdr_len(skb->data[pos]); if (!pskb_may_pull(skb, want)) return -EINVAL; pos += ieee802154_hdr_get_sechdr(skb->data + pos, &hdr->sec); } skb_pull(skb, pos); return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_pull); int ieee802154_mac_cmd_pl_pull(struct sk_buff *skb, struct ieee802154_mac_cmd_pl *mac_pl) { if (!pskb_may_pull(skb, sizeof(*mac_pl))) return -EINVAL; memcpy(mac_pl, skb->data, sizeof(*mac_pl)); skb_pull(skb, sizeof(*mac_pl)); return 0; } EXPORT_SYMBOL_GPL(ieee802154_mac_cmd_pl_pull); int ieee802154_hdr_peek_addrs(const struct sk_buff *skb, struct ieee802154_hdr *hdr) { const u8 *buf = skb_mac_header(skb); int pos = 3, rc; if (buf + 3 > skb_tail_pointer(skb)) return -EINVAL; memcpy(hdr, buf, 3); rc = ieee802154_hdr_minlen(hdr); if (rc < 0 || buf + rc > skb_tail_pointer(skb)) return -EINVAL; pos += ieee802154_hdr_get_addrs(buf + pos, hdr); return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_peek_addrs); int ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr) { const u8 *buf = skb_mac_header(skb); int pos; pos = ieee802154_hdr_peek_addrs(skb, hdr); if (pos < 0) return -EINVAL; if (hdr->fc.security_enabled) { u8 key_id_mode = IEEE802154_SCF_KEY_ID_MODE(*(buf + pos)); int want = pos + ieee802154_sechdr_lengths[key_id_mode]; if (buf + want > skb_tail_pointer(skb)) return -EINVAL; pos += ieee802154_hdr_get_sechdr(buf + pos, &hdr->sec); } return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_peek); int ieee802154_max_payload(const struct ieee802154_hdr *hdr) { int hlen = ieee802154_hdr_minlen(hdr); if (hdr->fc.security_enabled) { hlen += ieee802154_sechdr_lengths[hdr->sec.key_id_mode] - 1; hlen += ieee802154_sechdr_authtag_len(&hdr->sec); } return IEEE802154_MTU - hlen - IEEE802154_MFR_SIZE; } EXPORT_SYMBOL_GPL(ieee802154_max_payload); |
17 17 4 13 13 13 3 5 1 4 5 1 1 1 1 1 1 1 1 2 1 2 1 3 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 | // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 wext compat for managed mode. * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2009, 2020-2023 Intel Corporation */ #include <linux/export.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <net/cfg80211.h> #include <net/cfg80211-wext.h> #include "wext-compat.h" #include "nl80211.h" int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct cfg80211_cached_keys *ck = NULL; const u8 *prev_bssid = NULL; int err, i; ASSERT_RTNL(); lockdep_assert_wiphy(wdev->wiphy); if (!netif_running(wdev->netdev)) return 0; wdev->wext.connect.ie = wdev->wext.ie; wdev->wext.connect.ie_len = wdev->wext.ie_len; /* Use default background scan period */ wdev->wext.connect.bg_scan_period = -1; if (wdev->wext.keys) { wdev->wext.keys->def = wdev->wext.default_key; if (wdev->wext.default_key != -1) wdev->wext.connect.privacy = true; } if (!wdev->wext.connect.ssid_len) return 0; if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } if (wdev->wext.prev_bssid_valid) prev_bssid = wdev->wext.prev_bssid; err = cfg80211_connect(rdev, wdev->netdev, &wdev->wext.connect, ck, prev_bssid); if (err) kfree_sensitive(ck); return err; } int cfg80211_mgd_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_channel *chan = NULL; int err, freq; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq) { chan = ieee80211_get_channel(wdev->wiphy, freq); if (!chan) return -EINVAL; if (chan->flags & IEEE80211_CHAN_DISABLED) return -EINVAL; } if (wdev->conn) { bool event = true; if (wdev->wext.connect.channel == chan) return 0; /* if SSID set, we'll try right again, avoid event */ if (wdev->wext.connect.ssid_len) event = false; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, event); if (err) return err; } wdev->wext.connect.channel = chan; return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_channel *chan = NULL; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (wdev->valid_links) return -EOPNOTSUPP; if (wdev->links[0].client.current_bss) chan = wdev->links[0].client.current_bss->pub.channel; else if (wdev->wext.connect.channel) chan = wdev->wext.connect.channel; if (chan) { freq->m = chan->center_freq; freq->e = 6; return 0; } /* no channel if not joining */ return -EINVAL; } int cfg80211_mgd_wext_siwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); size_t len = data->length; int err; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (!data->flags) len = 0; /* iwconfig uses nul termination in SSID.. */ if (len > 0 && ssid[len - 1] == '\0') len--; if (wdev->conn) { bool event = true; if (wdev->wext.connect.ssid && len && len == wdev->wext.connect.ssid_len && memcmp(wdev->wext.connect.ssid, ssid, len) == 0) return 0; /* if SSID set now, we'll try to connect, avoid event */ if (len) event = false; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, event); if (err) return err; } wdev->wext.prev_bssid_valid = false; wdev->wext.connect.ssid = wdev->wext.ssid; memcpy(wdev->wext.ssid, ssid, len); wdev->wext.connect.ssid_len = len; wdev->wext.connect.crypto.control_port = false; wdev->wext.connect.crypto.control_port_ethertype = cpu_to_be16(ETH_P_PAE); return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; int ret = 0; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (wdev->valid_links) return -EINVAL; data->flags = 0; if (wdev->links[0].client.current_bss) { const struct element *ssid_elem; rcu_read_lock(); ssid_elem = ieee80211_bss_get_elem( &wdev->links[0].client.current_bss->pub, WLAN_EID_SSID); if (ssid_elem) { data->flags = 1; data->length = ssid_elem->datalen; if (data->length > IW_ESSID_MAX_SIZE) ret = -EINVAL; else memcpy(ssid, ssid_elem->data, data->length); } rcu_read_unlock(); } else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) { data->flags = 1; data->length = wdev->wext.connect.ssid_len; memcpy(ssid, wdev->wext.connect.ssid, data->length); } return ret; } int cfg80211_mgd_wext_siwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *bssid = ap_addr->sa_data; int err; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (ap_addr->sa_family != ARPHRD_ETHER) return -EINVAL; /* automatic mode */ if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) bssid = NULL; if (wdev->conn) { /* both automatic */ if (!bssid && !wdev->wext.connect.bssid) return 0; /* fixed already - and no change */ if (wdev->wext.connect.bssid && bssid && ether_addr_equal(bssid, wdev->wext.connect.bssid)) return 0; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, false); if (err) return err; } if (bssid) { memcpy(wdev->wext.bssid, bssid, ETH_ALEN); wdev->wext.connect.bssid = wdev->wext.bssid; } else wdev->wext.connect.bssid = NULL; return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; ap_addr->sa_family = ARPHRD_ETHER; if (wdev->valid_links) return -EOPNOTSUPP; if (wdev->links[0].client.current_bss) memcpy(ap_addr->sa_data, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); else eth_zero_addr(ap_addr->sa_data); return 0; } int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *ie = extra; int ie_len = data->length, err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!ie_len) ie = NULL; wiphy_lock(wdev->wiphy); /* no change */ err = 0; if (wdev->wext.ie_len == ie_len && memcmp(wdev->wext.ie, ie, ie_len) == 0) goto out; if (ie_len) { ie = kmemdup(extra, ie_len, GFP_KERNEL); if (!ie) { err = -ENOMEM; goto out; } } else ie = NULL; kfree(wdev->wext.ie); wdev->wext.ie = ie; wdev->wext.ie_len = ie_len; if (wdev->conn) { err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, false); if (err) goto out; } /* userspace better not think we'll reconnect */ err = 0; out: wiphy_unlock(wdev->wiphy); return err; } int cfg80211_wext_siwmlme(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_mlme *mlme = (struct iw_mlme *)extra; struct cfg80211_registered_device *rdev; int err; if (!wdev) return -EOPNOTSUPP; rdev = wiphy_to_rdev(wdev->wiphy); if (wdev->iftype != NL80211_IFTYPE_STATION) return -EINVAL; if (mlme->addr.sa_family != ARPHRD_ETHER) return -EINVAL; wiphy_lock(&rdev->wiphy); switch (mlme->cmd) { case IW_MLME_DEAUTH: case IW_MLME_DISASSOC: err = cfg80211_disconnect(rdev, dev, mlme->reason_code, true); break; default: err = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return err; } |
330 330 13 13 13 4186 4192 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | // SPDX-License-Identifier: GPL-2.0 /* * Block rq-qos policy for assigning an I/O priority class to requests. * * Using an rq-qos policy for assigning I/O priority class has two advantages * over using the ioprio_set() system call: * * - This policy is cgroup based so it has all the advantages of cgroups. * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos * controller affects page cache writeback I/O for filesystems that support * assiociating a cgroup with writeback I/O. See also * Documentation/admin-guide/cgroup-v2.rst. */ #include <linux/blk-mq.h> #include <linux/blk_types.h> #include <linux/kernel.h> #include <linux/module.h> #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-rq-qos.h" /** * enum prio_policy - I/O priority class policy. * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. * @POLICY_PROMOTE_TO_RT: modify no-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT. * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into * IOPRIO_CLASS_BE. * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT. * * See also <linux/ioprio.h>. */ enum prio_policy { POLICY_NO_CHANGE = 0, POLICY_PROMOTE_TO_RT = 1, POLICY_RESTRICT_TO_BE = 2, POLICY_ALL_TO_IDLE = 3, POLICY_NONE_TO_RT = 4, }; static const char *policy_name[] = { [POLICY_NO_CHANGE] = "no-change", [POLICY_PROMOTE_TO_RT] = "promote-to-rt", [POLICY_RESTRICT_TO_BE] = "restrict-to-be", [POLICY_ALL_TO_IDLE] = "idle", [POLICY_NONE_TO_RT] = "none-to-rt", }; static struct blkcg_policy ioprio_policy; /** * struct ioprio_blkg - Per (cgroup, request queue) data. * @pd: blkg_policy_data structure. */ struct ioprio_blkg { struct blkg_policy_data pd; }; /** * struct ioprio_blkcg - Per cgroup data. * @cpd: blkcg_policy_data structure. * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>. */ struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; }; static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL; } static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), struct ioprio_blkcg, cpd); } static struct ioprio_blkcg * ioprio_blkcg_from_css(struct cgroup_subsys_state *css) { return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); } static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio) { struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy); if (!pd) return NULL; return blkcg_to_ioprio_blkcg(pd->blkg->blkcg); } static int ioprio_show_prio_policy(struct seq_file *sf, void *v) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]); return 0; } static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of)); int ret; if (off != 0) return -EIO; /* kernfs_fop_write_iter() terminates 'buf' with '\0'. */ ret = sysfs_match_string(policy_name, buf); if (ret < 0) return ret; blkcg->prio_policy = ret; return nbytes; } static struct blkg_policy_data * ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct ioprio_blkg *ioprio_blkg; ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp); if (!ioprio_blkg) return NULL; return &ioprio_blkg->pd; } static void ioprio_free_pd(struct blkg_policy_data *pd) { struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd); kfree(ioprio_blkg); } static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) { struct ioprio_blkcg *blkcg; blkcg = kzalloc(sizeof(*blkcg), gfp); if (!blkcg) return NULL; blkcg->prio_policy = POLICY_NO_CHANGE; return &blkcg->cpd; } static void ioprio_free_cpd(struct blkcg_policy_data *cpd) { struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd); kfree(blkcg); } #define IOPRIO_ATTRS \ { \ .name = "prio.class", \ .seq_show = ioprio_show_prio_policy, \ .write = ioprio_set_prio_policy, \ }, \ { } /* sentinel */ /* cgroup v2 attributes */ static struct cftype ioprio_files[] = { IOPRIO_ATTRS }; /* cgroup v1 attributes */ static struct cftype ioprio_legacy_files[] = { IOPRIO_ATTRS }; static struct blkcg_policy ioprio_policy = { .dfl_cftypes = ioprio_files, .legacy_cftypes = ioprio_legacy_files, .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, .pd_alloc_fn = ioprio_alloc_pd, .pd_free_fn = ioprio_free_pd, }; void blkcg_set_ioprio(struct bio *bio) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); u16 prio; if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT || blkcg->prio_policy == POLICY_NONE_TO_RT) { /* * For RT threads, the default priority level is 4 because * task_nice is 0. By promoting non-RT io-priority to RT-class * and default level 4, those requests that are already * RT-class but need a higher io-priority can use ioprio_set() * to achieve this. */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4); return; } /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects * the lower priority of bi_ioprio and the cgroup I/O priority class. * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O * priority is assigned to the bio. */ prio = max_t(u16, bio->bi_ioprio, IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); if (prio > bio->bi_ioprio) bio->bi_ioprio = prio; } void blk_ioprio_exit(struct gendisk *disk) { blkcg_deactivate_policy(disk, &ioprio_policy); } int blk_ioprio_init(struct gendisk *disk) { return blkcg_activate_policy(disk, &ioprio_policy); } static int __init ioprio_init(void) { return blkcg_policy_register(&ioprio_policy); } static void __exit ioprio_exit(void) { blkcg_policy_unregister(&ioprio_policy); } module_init(ioprio_init); module_exit(ioprio_exit); |
1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | // SPDX-License-Identifier: GPL-2.0-or-later /* * USB HID driver for Glorious PC Gaming Race * Glorious Model O, O- and D mice. * * Copyright (c) 2020 Samuel ÄŒavoj <sammko@sammserver.com> */ /* */ #include <linux/hid.h> #include <linux/module.h> #include "hid-ids.h" MODULE_AUTHOR("Samuel ÄŒavoj <sammko@sammserver.com>"); MODULE_DESCRIPTION("HID driver for Glorious PC Gaming Race mice"); /* * Glorious Model O and O- specify the const flag in the consumer input * report descriptor, which leads to inputs being ignored. Fix this * by patching the descriptor. * * Glorious Model I incorrectly specifes the Usage Minimum for its * keyboard HID report, causing keycodes to be misinterpreted. * Fix this by setting Usage Minimum to 0 in that report. */ static __u8 *glorious_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { if (*rsize == 213 && rdesc[84] == 129 && rdesc[112] == 129 && rdesc[140] == 129 && rdesc[85] == 3 && rdesc[113] == 3 && rdesc[141] == 3) { hid_info(hdev, "patching Glorious Model O consumer control report descriptor\n"); rdesc[85] = rdesc[113] = rdesc[141] = \ HID_MAIN_ITEM_VARIABLE | HID_MAIN_ITEM_RELATIVE; } if (*rsize == 156 && rdesc[41] == 1) { hid_info(hdev, "patching Glorious Model I keyboard report descriptor\n"); rdesc[41] = 0; } return rdesc; } static void glorious_update_name(struct hid_device *hdev) { const char *model = "Device"; switch (hdev->product) { case USB_DEVICE_ID_GLORIOUS_MODEL_O: model = "Model O"; break; case USB_DEVICE_ID_GLORIOUS_MODEL_D: model = "Model D"; break; case USB_DEVICE_ID_GLORIOUS_MODEL_I: model = "Model I"; break; } snprintf(hdev->name, sizeof(hdev->name), "%s %s", "Glorious", model); } static int glorious_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; hdev->quirks |= HID_QUIRK_INPUT_PER_APP; ret = hid_parse(hdev); if (ret) return ret; glorious_update_name(hdev); return hid_hw_start(hdev, HID_CONNECT_DEFAULT); } static const struct hid_device_id glorious_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_SINOWEALTH, USB_DEVICE_ID_GLORIOUS_MODEL_O) }, { HID_USB_DEVICE(USB_VENDOR_ID_SINOWEALTH, USB_DEVICE_ID_GLORIOUS_MODEL_D) }, { HID_USB_DEVICE(USB_VENDOR_ID_LAVIEW, USB_DEVICE_ID_GLORIOUS_MODEL_I) }, { } }; MODULE_DEVICE_TABLE(hid, glorious_devices); static struct hid_driver glorious_driver = { .name = "glorious", .id_table = glorious_devices, .probe = glorious_probe, .report_fixup = glorious_report_fixup }; module_hid_driver(glorious_driver); MODULE_LICENSE("GPL"); |
1321 279 1672 85 10941 337 20 6 694 279 270 279 2191 1611 1614 605 1611 6 1613 12574 12057 11 2459 1354 270 12572 12059 32 2 279 2 428 1 378 428 686 687 685 1611 1613 1592 12 284 1613 6 6 6 1613 1613 1611 1614 1613 1613 284 1301 2 1420 605 1612 2326 1088 1240 913 5 12318 3199 219 638 782 626 197 1349 1351 586 988 1351 1292 155 1292 1289 1291 1 5 1352 7 331 4 5 7 7 7 1868 49 853 1335 1077 337 279 1 218 279 279 1299 1300 1299 1301 278 278 1 279 279 34 88 1 216 118 3 120 121 121 282 282 282 28 9 282 247 115 282 282 277 325 1469 277 282 282 14 282 279 282 279 282 2 282 278 10 108 102 117 24 230 85 2 84 3 2 277 2 217 62 248 2 247 248 222 20 2040 1975 604 2040 2040 3 3 3 3 3 28 14 20 28 28 279 28 26 14 14 28 1 279 10 10 28 10 10 10 10 279 275 6 279 278 279 279 278 278 279 279 2 279 279 278 279 2 279 279 279 279 279 279 279 278 279 4 279 14 27 28 27 27 14 25 6 5 27 3482 12317 12287 11134 10174 10209 12317 11989 12263 28 28 3 10 28 28 28 28 28 28 28 28 28 10 21 21 21 21 21 19 7 21 27 11 11 10 3 3 3 3 270 270 270 270 270 270 1 269 218 58 270 268 41 235 270 270 270 269 270 270 270 29 269 114 31 31 184 30 184 210 184 31 270 270 113 113 270 270 31 184 5 5 276 11 270 5 15 15 15 416 417 417 299 417 417 1059 20 157 13 164 20 2039 2041 2041 2038 1204 1655 1181 160 20 20 1 15 2 7 11 1053 10774 10819 692 686 687 692 687 98 98 98 20 19 20 20 20 20 20 5 15 20 1452 1369 470 1298 1299 161 1452 1450 979 1106 1 1448 82 203 1370 1449 1298 159 1450 48 29 19 15 996 168 59 3 730 400 1688 1482 154 1688 1526 1091 1326 162 4 601 587 24 601 600 277 277 2017 1386 177 644 270 1842 1482 1455 277 1794 1796 1046 20 1668 98 612 613 171 171 545 612 613 613 611 56 56 20 41 57 57 2001 2002 467 1621 1621 2001 57 2001 1998 1002 122 426 427 1 426 428 423 424 428 426 4810 4822 4328 258 1543 1205 751 4710 667 4702 427 4706 4710 625 1298 478 478 477 477 478 156 477 478 478 478 1 352 9 397 80 478 478 83 545 545 545 545 545 5 545 5 3379 910 3371 3368 844 842 83 545 5 613 613 545 544 544 544 545 477 478 477 478 478 83 478 4 478 477 31 32 32 32 361 329 32 32 32 32 361 2 359 1259 13 1492 905 158 158 158 920 921 921 919 1454 1453 1451 1454 1456 1454 1457 1436 167 5 1348 1326 1296 52 267 1244 17 1417 1413 2192 2191 2170 286 286 965 966 967 967 2002 2 2003 221 220 1967 2 1962 905 906 4 2165 4 2 846 848 1564 1671 843 2297 2303 191 190 191 191 1524 1525 687 688 689 688 692 690 38 38 38 38 613 612 191 190 158 157 158 158 158 158 158 158 158 158 158 158 157 158 1048 899 261 58 58 10805 10823 150 10842 10777 3177 212 3070 3063 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 | // SPDX-License-Identifier: GPL-2.0+ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett <Liam.Howlett@oracle.com> * Matthew Wilcox <willy@infradead.org> * Copyright (c) 2023 ByteDance * Author: Peng Zhang <zhangpeng.00@bytedance.com> */ /* * DOC: Interesting implementation details of the Maple Tree * * Each node type has a number of slots for entries and a number of slots for * pivots. In the case of dense nodes, the pivots are implied by the position * and are simply the slot index + the minimum of the node. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges. Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. * * * The following illustrates the layout of a range64 nodes slots and pivots. * * * Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 | * ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ * │ │ │ │ │ │ │ │ └─ Implied maximum * │ │ │ │ │ │ │ └─ Pivot 14 * │ │ │ │ │ │ └─ Pivot 13 * │ │ │ │ │ └─ Pivot 12 * │ │ │ │ └─ Pivot 11 * │ │ │ └─ Pivot 2 * │ │ └─ Pivot 1 * │ └─ Pivot 0 * └─ Implied minimum * * Slot contents: * Internal (non-leaf) nodes contain pointers to other nodes. * Leaf nodes contain entries. * * The location of interest is often referred to as an offset. All offsets have * a slot, but the last offset has an implied pivot from the node above (or * UINT_MAX for the root node. * * Ranges complicate certain write activities. When modifying any of * the B-tree variants, it is known that one entry will either be added or * deleted. When modifying the Maple Tree, one store operation may overwrite * the entire data set, or one half of the tree, or the middle half of the tree. * */ #include <linux/maple_tree.h> #include <linux/xarray.h> #include <linux/types.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/limits.h> #include <asm/barrier.h> #define CREATE_TRACE_POINTS #include <trace/events/maple_tree.h> #define MA_ROOT_PARENT 1 /* * Maple state flags * * MA_STATE_BULK - Bulk insert mode * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation */ #define MA_STATE_BULK 1 #define MA_STATE_REBALANCE 2 #define MA_STATE_PREALLOC 4 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) #define ma_mnode_ptr(x) ((struct maple_node *)(x)) #define ma_enode_ptr(x) ((struct maple_enode *)(x)) static struct kmem_cache *maple_node_cache; #ifdef CONFIG_DEBUG_MAPLE_TREE static const unsigned long mt_max[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = ULONG_MAX, [maple_range_64] = ULONG_MAX, [maple_arange_64] = ULONG_MAX, }; #define mt_node_max(x) mt_max[mte_node_type(x)] #endif static const unsigned char mt_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = MAPLE_RANGE64_SLOTS, [maple_range_64] = MAPLE_RANGE64_SLOTS, [maple_arange_64] = MAPLE_ARANGE64_SLOTS, }; #define mt_slot_count(x) mt_slots[mte_node_type(x)] static const unsigned char mt_pivots[] = { [maple_dense] = 0, [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, [maple_range_64] = MAPLE_RANGE64_SLOTS - 1, [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, }; #define mt_pivot_count(x) mt_pivots[mte_node_type(x)] static const unsigned char mt_min_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS / 2, [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] #define MAPLE_BIG_NODE_SLOTS (MAPLE_RANGE64_SLOTS * 2 + 2) #define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) struct maple_big_node { struct maple_pnode *parent; unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; union { struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; struct { unsigned long padding[MAPLE_BIG_NODE_GAPS]; unsigned long gap[MAPLE_BIG_NODE_GAPS]; }; }; unsigned char b_end; enum maple_type type; }; /* * The maple_subtree_state is used to build a tree to replace a segment of an * existing tree in a more atomic way. Any walkers of the older tree will hit a * dead node and restart on updates. */ struct maple_subtree_state { struct ma_state *orig_l; /* Original left side of subtree */ struct ma_state *orig_r; /* Original right side of subtree */ struct ma_state *l; /* New left side of subtree */ struct ma_state *m; /* New middle of subtree (rare) */ struct ma_state *r; /* New right side of subtree */ struct ma_topiary *free; /* nodes to be freed */ struct ma_topiary *destroy; /* Nodes to be destroyed (walked and freed) */ struct maple_big_node *bn; }; #ifdef CONFIG_KASAN_STACK /* Prevent mas_wr_bnode() from exceeding the stack frame limit */ #define noinline_for_kasan noinline_for_stack #else #define noinline_for_kasan inline #endif /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { return kmem_cache_alloc(maple_node_cache, gfp); } static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } static inline void mt_free_one(struct maple_node *node) { kmem_cache_free(maple_node_cache, node); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } static void mt_free_rcu(struct rcu_head *head) { struct maple_node *node = container_of(head, struct maple_node, rcu); kmem_cache_free(maple_node_cache, node); } /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free * * The maple tree uses the parent pointer to indicate this node is no longer in * use and will be freed. */ static void ma_free_rcu(struct maple_node *node) { WARN_ON(node->parent != ma_parent_ptr(node)); call_rcu(&node->rcu, mt_free_rcu); } static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX); new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET; mas->tree->ma_flags = new_flags; } static unsigned int mas_mt_height(struct ma_state *mas) { return mt_height(mas->tree); } static inline unsigned int mt_attr(struct maple_tree *mt) { return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; } static __always_inline enum maple_type mte_node_type( const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK; } static __always_inline bool ma_is_dense(const enum maple_type type) { return type < maple_leaf_64; } static __always_inline bool ma_is_leaf(const enum maple_type type) { return type < maple_range_64; } static __always_inline bool mte_is_leaf(const struct maple_enode *entry) { return ma_is_leaf(mte_node_type(entry)); } /* * We also reserve values with the bottom two bits set to '10' which are * below 4096 */ static __always_inline bool mt_is_reserved(const void *entry) { return ((unsigned long)entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); } static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); mas->status = ma_error; } static __always_inline bool mas_is_ptr(const struct ma_state *mas) { return mas->status == ma_root; } static __always_inline bool mas_is_start(const struct ma_state *mas) { return mas->status == ma_start; } static __always_inline bool mas_is_none(const struct ma_state *mas) { return mas->status == ma_none; } static __always_inline bool mas_is_paused(const struct ma_state *mas) { return mas->status == ma_pause; } static __always_inline bool mas_is_overflow(struct ma_state *mas) { return mas->status == ma_overflow; } static inline bool mas_is_underflow(struct ma_state *mas) { return mas->status == ma_underflow; } static __always_inline struct maple_node *mte_to_node( const struct maple_enode *entry) { return (struct maple_node *)((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mte_to_mat() - Convert a maple encoded node to a maple topiary node. * @entry: The maple encoded node * * Return: a maple topiary pointer */ static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry) { return (struct maple_topiary *) ((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mas_mn() - Get the maple state node. * @mas: The maple state * * Return: the maple node (not encoded - bare pointer). */ static inline struct maple_node *mas_mn(const struct ma_state *mas) { return mte_to_node(mas->node); } /* * mte_set_node_dead() - Set a maple encoded node as dead. * @mn: The maple encoded node. */ static inline void mte_set_node_dead(struct maple_enode *mn) { mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn)); smp_wmb(); /* Needed for RCU */ } /* Bit 1 indicates the root is a node */ #define MAPLE_ROOT_NODE 0x02 /* maple_type stored bit 3-6 */ #define MAPLE_ENODE_TYPE_SHIFT 0x03 /* Bit 2 means a NULL somewhere below */ #define MAPLE_ENODE_NULL 0x04 static inline struct maple_enode *mt_mk_node(const struct maple_node *node, enum maple_type type) { return (void *)((unsigned long)node | (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); } static inline void *mte_mk_root(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ROOT_NODE); } static inline void *mte_safe_root(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } static inline void *mte_set_full(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } static inline void *mte_clear_full(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } static inline bool mte_has_null(const struct maple_enode *node) { return (unsigned long)node & MAPLE_ENODE_NULL; } static __always_inline bool ma_is_root(struct maple_node *node) { return ((unsigned long)node->parent & MA_ROOT_PARENT); } static __always_inline bool mte_is_root(const struct maple_enode *node) { return ma_is_root(mte_to_node(node)); } static inline bool mas_is_root_limits(const struct ma_state *mas) { return !mas->min && mas->max == ULONG_MAX; } static __always_inline bool mt_is_alloc(struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE); } /* * The Parent Pointer * Excluding root, the parent pointer is 256B aligned like all other tree nodes. * When storing a 32 or 64 bit values, the offset can fit into 5 bits. The 16 * bit values need an extra bit to store the offset. This extra bit comes from * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. * * Note types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root * 0b?00 : 16 bit values, type in 0-1, slot in 2-7 * 0b010 : 32 bit values, type in 0-2, slot in 3-7 * 0b110 : 64 bit values, type in 0-2, slot in 3-7 */ #define MAPLE_PARENT_ROOT 0x01 #define MAPLE_PARENT_SLOT_SHIFT 0x03 #define MAPLE_PARENT_SLOT_MASK 0xF8 #define MAPLE_PARENT_16B_SLOT_SHIFT 0x02 #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 #define MAPLE_PARENT_RANGE32 0x04 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* * mte_parent_shift() - Get the parent shift for the slot storage. * @parent: The parent pointer cast as an unsigned long * Return: The shift into that pointer to the star to of the slot */ static inline unsigned long mte_parent_shift(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_SHIFT; return MAPLE_PARENT_16B_SLOT_SHIFT; } /* * mte_parent_slot_mask() - Get the slot mask for the parent. * @parent: The parent pointer cast as an unsigned long. * Return: The slot mask for that parent. */ static inline unsigned long mte_parent_slot_mask(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_MASK; return MAPLE_PARENT_16B_SLOT_MASK; } /* * mas_parent_type() - Return the maple_type of the parent from the stored * parent type. * @mas: The maple state * @enode: The maple_enode to extract the parent's enum * Return: The node->parent maple_type */ static inline enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) { unsigned long p_type; p_type = (unsigned long)mte_to_node(enode)->parent; if (WARN_ON(p_type & MAPLE_PARENT_ROOT)) return 0; p_type &= MAPLE_NODE_MASK; p_type &= ~mte_parent_slot_mask(p_type); switch (p_type) { case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ if (mt_is_alloc(mas->tree)) return maple_arange_64; return maple_range_64; } return 0; } /* * mas_set_parent() - Set the parent node and encode the slot * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. * * Slot number is encoded in the enode->parent bit 3-6 or 2-6, depending on the * parent type. */ static inline void mas_set_parent(struct ma_state *mas, struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); MAS_BUG_ON(mas, p_type == maple_dense); MAS_BUG_ON(mas, p_type == maple_leaf_64); switch (p_type) { case maple_range_64: case maple_arange_64: shift = MAPLE_PARENT_SLOT_SHIFT; type = MAPLE_PARENT_RANGE64; break; default: case maple_dense: case maple_leaf_64: shift = type = 0; break; } val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */ val |= (slot << shift) | type; mte_to_node(enode)->parent = ma_parent_ptr(val); } /* * mte_parent_slot() - get the parent slot of @enode. * @enode: The encoded maple node. * * Return: The slot in the parent node where @enode resides. */ static __always_inline unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; if (unlikely(val & MA_ROOT_PARENT)) return 0; /* * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT */ return (val & MAPLE_PARENT_16B_SLOT_MASK) >> mte_parent_shift(val); } /* * mte_parent() - Get the parent of @node. * @node: The encoded maple node. * * Return: The parent maple node. */ static __always_inline struct maple_node *mte_parent(const struct maple_enode *enode) { return (void *)((unsigned long) (mte_to_node(enode)->parent) & ~MAPLE_NODE_MASK); } /* * ma_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool ma_dead_node(const struct maple_node *node) { struct maple_node *parent; /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool mte_dead_node(const struct maple_enode *enode) { struct maple_node *parent, *node; node = mte_to_node(enode); /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = mte_parent(enode); return (parent == node); } /* * mas_allocated() - Get the number of nodes allocated in a maple state. * @mas: The maple state * * The ma_state alloc member is overloaded to hold a pointer to the first * allocated node or to the number of requested nodes to allocate. If bit 0 is * set, then the alloc contains the number of requested nodes. If there is an * allocated node, then the total allocated nodes is in that node. * * Return: The total number of nodes allocated */ static inline unsigned long mas_allocated(const struct ma_state *mas) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) return 0; return mas->alloc->total; } /* * mas_set_alloc_req() - Set the requested number of allocations. * @mas: the maple state * @count: the number of allocations. * * The requested number of allocations is either in the first allocated node, * located in @mas->alloc->request_count, or directly in @mas->alloc if there is * no allocated node. Set the request either in the node or do the necessary * encoding to store in @mas->alloc directly. */ static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { if (!count) mas->alloc = NULL; else mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); return; } mas->alloc->request_count = count; } /* * mas_alloc_req() - get the requested number of allocations. * @mas: The maple state * * The alloc count is either stored directly in @mas, or in * @mas->alloc->request_count if there is at least one node allocated. Decode * the request count if it's stored directly in @mas->alloc. * * Return: The allocation request count. */ static inline unsigned int mas_alloc_req(const struct ma_state *mas) { if ((unsigned long)mas->alloc & 0x1) return (unsigned long)(mas->alloc) >> 1; else if (mas->alloc) return mas->alloc->request_count; return 0; } /* * ma_pivots() - Get a pointer to the maple node pivots. * @node - the maple node * @type - the node type * * In the event of a dead node, this array may be %NULL * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.pivot; case maple_range_64: case maple_leaf_64: return node->mr64.pivot; case maple_dense: return NULL; } return NULL; } /* * ma_gaps() - Get a pointer to the maple node gaps. * @node - the maple node * @type - the node type * * Return: A pointer to the maple node gaps */ static inline unsigned long *ma_gaps(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.gap; case maple_range_64: case maple_leaf_64: case maple_dense: return NULL; } return NULL; } /* * mas_safe_pivot() - get the pivot at @piv or mas->max. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @piv: The pivot to fetch * @type: The maple node type * * Return: The pivot at @piv within the limit of the @pivots array, @mas->max * otherwise. */ static __always_inline unsigned long mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots, unsigned char piv, enum maple_type type) { if (piv >= mt_pivots[type]) return mas->max; return pivots[piv]; } /* * mas_safe_min() - Return the minimum for a given offset. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @offset: The offset into the pivot array * * Return: The minimum range value that is contained in @offset. */ static inline unsigned long mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) { if (likely(offset)) return pivots[offset - 1] + 1; return mas->min; } /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. * @mn: The encoded maple node * @piv: The pivot offset * @val: The value of the pivot */ static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, unsigned long val) { struct maple_node *node = mte_to_node(mn); enum maple_type type = mte_node_type(mn); BUG_ON(piv >= mt_pivots[type]); switch (type) { case maple_range_64: case maple_leaf_64: node->mr64.pivot[piv] = val; break; case maple_arange_64: node->ma64.pivot[piv] = val; break; case maple_dense: break; } } /* * ma_slots() - Get a pointer to the maple node slots. * @mn: The maple node * @mt: The maple node type * * Return: A pointer to the maple node slots */ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return mn->ma64.slot; case maple_range_64: case maple_leaf_64: return mn->mr64.slot; case maple_dense: return mn->slot; } return NULL; } static inline bool mt_write_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline void *mt_slot(const struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_check(slots[offset], mt_locked(mt)); } static __always_inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset. */ static __always_inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot_locked(mas->tree, slots, offset); } /* * mas_slot() - Get the slot value when not holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset */ static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot(mas->tree, slots, offset); } /* * mas_root() - Get the maple tree root. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static __always_inline void *mas_root(struct ma_state *mas) { return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree)); } static inline void *mt_root_locked(struct maple_tree *mt) { return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* * mas_root_locked() - Get the maple tree root when holding the maple tree lock. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static inline void *mas_root_locked(struct ma_state *mas) { return mt_root_locked(mas->tree); } static inline struct maple_metadata *ma_meta(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return &mn->ma64.meta; default: return &mn->mr64.meta; } } /* * ma_set_meta() - Set the metadata information of a node. * @mn: The maple node * @mt: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, unsigned char offset, unsigned char end) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; meta->end = end; } /* * mt_clear_meta() - clear the metadata information of a node, if it exists * @mt: The maple tree * @mn: The maple node * @type: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next; switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; next = mt_slot_locked(mt, slots, MAPLE_RANGE64_SLOTS - 1); if (unlikely((mte_to_node(next) && mte_node_type(next)))) return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: meta = ma_meta(mn, type); break; default: return; } meta->gap = 0; meta->end = 0; } /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node * @mt: The maple node type */ static inline unsigned char ma_meta_end(struct maple_node *mn, enum maple_type mt) { struct maple_metadata *meta = ma_meta(mn, mt); return meta->end; } /* * ma_meta_gap() - Get the largest gap location of a node from the metadata * @mn: The maple node */ static inline unsigned char ma_meta_gap(struct maple_node *mn) { return mn->ma64.meta.gap; } /* * ma_set_meta_gap() - Set the largest gap location in a nodes metadata * @mn: The maple node * @mn: The maple node type * @offset: The location of the largest gap. */ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, unsigned char offset) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; } /* * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. * @mat - the ma_topiary, a linked list of dead nodes. * @dead_enode - the node to be marked as dead and added to the tail of the list * * Add the @dead_enode to the linked list in @mat. */ static inline void mat_add(struct ma_topiary *mat, struct maple_enode *dead_enode) { mte_set_node_dead(dead_enode); mte_to_mat(dead_enode)->next = NULL; if (!mat->tail) { mat->tail = mat->head = dead_enode; return; } mte_to_mat(mat->tail)->next = dead_enode; mat->tail = dead_enode; } static void mt_free_walk(struct rcu_head *head); static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. * @mas - the maple state * @mat - the ma_topiary linked list of dead nodes to free. * * Destroy walk a dead list. */ static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) { struct maple_enode *next; struct maple_node *node; bool in_rcu = mt_in_rcu(mas->tree); while (mat->head) { next = mte_to_mat(mat->head)->next; node = mte_to_node(mat->head); mt_destroy_walk(mat->head, mas->tree, !in_rcu); if (in_rcu) call_rcu(&node->rcu, mt_free_walk); mat->head = next; } } /* * mas_descend() - Descend into the slot stored in the ma_state. * @mas - the maple state. * * Note: Not RCU safe, only use in write side or debug code. */ static inline void mas_descend(struct ma_state *mas) { enum maple_type type; unsigned long *pivots; struct maple_node *node; void __rcu **slots; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); slots = ma_slots(node, type); if (mas->offset) mas->min = pivots[mas->offset - 1] + 1; mas->max = mas_safe_pivot(mas, pivots, mas->offset, type); mas->node = mas_slot(mas, slots, mas->offset); } /* * mte_set_gap() - Set a maple node gap. * @mn: The encoded maple node * @gap: The offset of the gap to set * @val: The gap value */ static inline void mte_set_gap(const struct maple_enode *mn, unsigned char gap, unsigned long val) { switch (mte_node_type(mn)) { default: break; case maple_arange_64: mte_to_node(mn)->ma64.gap[gap] = val; break; } } /* * mas_ascend() - Walk up a level of the tree. * @mas: The maple state * * Sets the @mas->max and @mas->min to the correct values when walking up. This * may cause several levels of walking up to find the correct min and max. * May find a dead node which will cause a premature return. * Return: 1 on dead node, 0 otherwise */ static int mas_ascend(struct ma_state *mas) { struct maple_enode *p_enode; /* parent enode. */ struct maple_enode *a_enode; /* ancestor enode. */ struct maple_node *a_node; /* ancestor node. */ struct maple_node *p_node; /* parent node. */ unsigned char a_slot; enum maple_type a_type; unsigned long min, max; unsigned long *pivots; bool set_max = false, set_min = false; a_node = mas_mn(mas); if (ma_is_root(a_node)) { mas->offset = 0; return 0; } p_node = mte_parent(mas->node); if (unlikely(a_node == p_node)) return 1; a_type = mas_parent_type(mas, mas->node); mas->offset = mte_parent_slot(mas->node); a_enode = mt_mk_node(p_node, a_type); /* Check to make sure all parent information is still accurate */ if (p_node != mte_parent(mas->node)) return 1; mas->node = a_enode; if (mte_is_root(a_enode)) { mas->max = ULONG_MAX; mas->min = 0; return 0; } min = 0; max = ULONG_MAX; if (!mas->offset) { min = mas->min; set_min = true; } if (mas->max == ULONG_MAX) set_max = true; do { p_enode = a_enode; a_type = mas_parent_type(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); a_enode = mt_mk_node(a_node, a_type); pivots = ma_pivots(a_node, a_type); if (unlikely(ma_dead_node(a_node))) return 1; if (!set_min && a_slot) { set_min = true; min = pivots[a_slot - 1] + 1; } if (!set_max && a_slot < mt_pivots[a_type]) { set_max = true; max = pivots[a_slot]; } if (unlikely(ma_dead_node(a_node))) return 1; if (unlikely(ma_is_root(a_node))) break; } while (!set_min || !set_max); mas->max = max; mas->min = min; return 0; } /* * mas_pop_node() - Get a previously allocated maple node from the maple state. * @mas: The maple state * * Return: A pointer to a maple node. */ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); unsigned int req = mas_alloc_req(mas); /* nothing or a request pending. */ if (WARN_ON(!total)) return NULL; if (total == 1) { /* single allocation in this ma_state */ mas->alloc = NULL; ret = node; goto single_node; } if (node->node_count == 1) { /* Single allocation in this node. */ mas->alloc = node->slot[0]; mas->alloc->total = node->total - 1; ret = node; goto new_head; } node->total--; ret = node->slot[--node->node_count]; node->slot[node->node_count] = NULL; single_node: new_head: if (req) { req++; mas_set_alloc_req(mas, req); } memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; } /* * mas_push_node() - Push a node back on the maple state allocation. * @mas: The maple state * @used: The used maple node * * Stores the maple node back into @mas->alloc for reuse. Updates allocated and * requested node count as necessary. */ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) { struct maple_alloc *reuse = (struct maple_alloc *)used; struct maple_alloc *head = mas->alloc; unsigned long count; unsigned int requested = mas_alloc_req(mas); count = mas_allocated(mas); reuse->request_count = 0; reuse->node_count = 0; if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { head->slot[head->node_count++] = reuse; head->total++; goto done; } reuse->total = 1; if ((head) && !((unsigned long)head & 0x1)) { reuse->slot[0] = head; reuse->node_count = 1; reuse->total += head->total; } mas->alloc = reuse; done: if (requested > 1) mas_set_alloc_req(mas, requested - 1); } /* * mas_alloc_nodes() - Allocate nodes into a maple state * @mas: The maple state * @gfp: The GFP Flags */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; unsigned int max_req = 0; if (!requested) return; mas_set_alloc_req(mas, 0); if (mas->mas_flags & MA_STATE_PREALLOC) { if (allocated) return; BUG_ON(!allocated); WARN_ON(!allocated); } if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one; if (allocated) { node->slot[0] = mas->alloc; node->node_count = 1; } else { node->node_count = 0; } mas->alloc = node; node->total = ++allocated; requested--; } node = mas->alloc; node->request_count = 0; while (requested) { max_req = MAPLE_ALLOC_SLOTS - node->node_count; slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; if (node->node_count == 0) { node->slot[0]->node_count = 0; node->slot[0]->request_count = 0; } node->node_count += count; allocated += count; node = node->slot[0]; requested -= count; } mas->alloc->total = allocated; return; nomem_bulk: /* Clean up potential freed allocations on bulk failure */ memset(slots, 0, max_req * sizeof(unsigned long)); nomem_one: mas_set_alloc_req(mas, requested); if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) mas->alloc->total = allocated; mas_set_err(mas, -ENOMEM); } /* * mas_free() - Free an encoded maple node * @mas: The maple state * @used: The encoded maple node to free. * * Uses rcu free if necessary, pushes @used back on the maple state allocations * otherwise. */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { struct maple_node *tmp = mte_to_node(used); if (mt_in_rcu(mas->tree)) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_node_count_gfp() - Check if enough nodes are allocated and request more * if there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * @gfp: the gfp flags */ static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) { unsigned long allocated = mas_allocated(mas); if (allocated < count) { mas_set_alloc_req(mas, count - allocated); mas_alloc_nodes(mas, gfp); } } /* * mas_node_count() - Check if enough nodes are allocated and request more if * there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags. */ static void mas_node_count(struct ma_state *mas, int count) { return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN); } /* * mas_start() - Sets up maple state for operations. * @mas: The maple state. * * If mas->status == mas_start, then set the min, max and depth to * defaults. * * Return: * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none * - If it's a single entry: The entry & mas->status == mas_root * - If it's a tree: NULL & mas->status == safe root node. */ static inline struct maple_enode *mas_start(struct ma_state *mas) { if (likely(mas_is_start(mas))) { struct maple_enode *root; mas->min = 0; mas->max = ULONG_MAX; retry: mas->depth = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) goto retry; return NULL; } /* empty tree */ if (unlikely(!root)) { mas->node = NULL; mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ if (mas->index > 0) return NULL; return root; } return NULL; } /* * ma_data_end() - Find the end of the data in a node. * @node: The maple node * @type: The maple node type * @pivots: The array of pivots in the node * @max: The maximum value in the node * * Uses metadata to find the end of the data when possible. * Return: The zero indexed last slot with data (may be null). */ static __always_inline unsigned char ma_data_end(struct maple_node *node, enum maple_type type, unsigned long *pivots, unsigned long max) { unsigned char offset; if (!pivots) return 0; if (type == maple_arange_64) return ma_meta_end(node, type); offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == max)) return offset; return mt_pivots[type]; } /* * mas_data_end() - Find the end of the data (slot). * @mas: the maple state * * This method is optimized to check the metadata of a node if the node type * supports data end metadata. * * Return: The zero indexed last slot with data (may be null). */ static inline unsigned char mas_data_end(struct ma_state *mas) { enum maple_type type; struct maple_node *node; unsigned char offset; unsigned long *pivots; type = mte_node_type(mas->node); node = mas_mn(mas); if (type == maple_arange_64) return ma_meta_end(node, type); pivots = ma_pivots(node, type); if (unlikely(ma_dead_node(node))) return 0; offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == mas->max)) return offset; return mt_pivots[type]; } /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node * @mas - the maple state * * Return: The maximum gap in the leaf. */ static unsigned long mas_leaf_max_gap(struct ma_state *mas) { enum maple_type mt; unsigned long pstart, gap, max_gap; struct maple_node *mn; unsigned long *pivots; void __rcu **slots; unsigned char i; unsigned char max_piv; mt = mte_node_type(mas->node); mn = mas_mn(mas); slots = ma_slots(mn, mt); max_gap = 0; if (unlikely(ma_is_dense(mt))) { gap = 0; for (i = 0; i < mt_slots[mt]; i++) { if (slots[i]) { if (gap > max_gap) max_gap = gap; gap = 0; } else { gap++; } } if (gap > max_gap) max_gap = gap; return max_gap; } /* * Check the first implied pivot optimizes the loop below and slot 1 may * be skipped if there is a gap in slot 0. */ pivots = ma_pivots(mn, mt); if (likely(!slots[0])) { max_gap = pivots[0] - mas->min + 1; i = 2; } else { i = 1; } /* reduce max_piv as the special case is checked before the loop */ max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1; /* * Check end implied pivot which can only be a gap on the right most * node. */ if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) { gap = ULONG_MAX - pivots[max_piv]; if (gap > max_gap) max_gap = gap; if (max_gap > pivots[max_piv] - mas->min) return max_gap; } for (; i <= max_piv; i++) { /* data == no gap. */ if (likely(slots[i])) continue; pstart = pivots[i - 1]; gap = pivots[i] - pstart; if (gap > max_gap) max_gap = gap; /* There cannot be two gaps in a row. */ i++; } return max_gap; } /* * ma_max_gap() - Get the maximum gap in a maple node (non-leaf) * @node: The maple node * @gaps: The pointer to the gaps * @mt: The maple node type * @*off: Pointer to store the offset location of the gap. * * Uses the metadata data end to scan backwards across set gaps. * * Return: The maximum gap value */ static inline unsigned long ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, unsigned char *off) { unsigned char offset, i; unsigned long max_gap = 0; i = offset = ma_meta_end(node, mt); do { if (gaps[i] > max_gap) { max_gap = gaps[i]; offset = i; } } while (i--); *off = offset; return max_gap; } /* * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * * Return: The gap value. */ static inline unsigned long mas_max_gap(struct ma_state *mas) { unsigned long *gaps; unsigned char offset; enum maple_type mt; struct maple_node *node; mt = mte_node_type(mas->node); if (ma_is_leaf(mt)) return mas_leaf_max_gap(mas); node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node); gaps = ma_gaps(node, mt); return gaps[offset]; } /* * mas_parent_gap() - Set the parent gap and any gaps above, as needed * @mas: The maple state * @offset: The gap offset in the parent to set * @new: The new gap value. * * Set the parent gap then continue to set the gap upwards, using the metadata * of the parent to see if it is necessary to check the node above. */ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, unsigned long new) { unsigned long meta_gap = 0; struct maple_node *pnode; struct maple_enode *penode; unsigned long *pgaps; unsigned char meta_offset; enum maple_type pmt; pnode = mte_parent(mas->node); pmt = mas_parent_type(mas, mas->node); penode = mt_mk_node(pnode, pmt); pgaps = ma_gaps(pnode, pmt); ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode); meta_gap = pgaps[meta_offset]; pgaps[offset] = new; if (meta_gap == new) return; if (offset != meta_offset) { if (meta_gap > new) return; ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } if (ma_is_root(pnode)) return; /* Go to the parent node. */ pnode = mte_parent(penode); pmt = mas_parent_type(mas, penode); pgaps = ma_gaps(pnode, pmt); offset = mte_parent_slot(penode); penode = mt_mk_node(pnode, pmt); goto ascend; } /* * mas_update_gap() - Update a nodes gaps and propagate up if necessary. * @mas - the maple state. */ static inline void mas_update_gap(struct ma_state *mas) { unsigned char pslot; unsigned long p_gap; unsigned long max_gap; if (!mt_is_alloc(mas->tree)) return; if (mte_is_root(mas->node)) return; max_gap = mas_max_gap(mas); pslot = mte_parent_slot(mas->node); p_gap = ma_gaps(mte_parent(mas->node), mas_parent_type(mas, mas->node))[pslot]; if (p_gap != max_gap) mas_parent_gap(mas, pslot, max_gap); } /* * mas_adopt_children() - Set the parent pointer of all nodes in @parent to * @parent with the slot encoded. * @mas - the maple state (for the tree) * @parent - the maple encoded node containing the children. */ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) { enum maple_type type = mte_node_type(parent); struct maple_node *node = mte_to_node(parent); void __rcu **slots = ma_slots(node, type); unsigned long *pivots = ma_pivots(node, type); struct maple_enode *child; unsigned char offset; offset = ma_data_end(node, type, pivots, mas->max); do { child = mas_slot_locked(mas, slots, offset); mas_set_parent(mas, child, parent, offset); } while (offset--); } /* * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old * node as dead. * @mas - the maple state with the new node * @old_enode - The old maple encoded node to replace. */ static inline void mas_put_in_tree(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { unsigned char offset; void __rcu **slots; if (mte_is_root(mas->node)) { mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mas_set_height(mas); } else { offset = mte_parent_slot(mas->node); slots = ma_slots(mte_parent(mas->node), mas_parent_type(mas, mas->node)); rcu_assign_pointer(slots[offset], mas->node); } mte_set_node_dead(old_enode); } /* * mas_replace_node() - Replace a node by putting it in the tree, marking it * dead, and freeing it. * the parent encoding to locate the maple node in the tree. * @mas - the ma_state with @mas->node pointing to the new node. * @old_enode - The old maple encoded node. */ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { mas_put_in_tree(mas, old_enode); mas_free(mas, old_enode); } /* * mas_find_child() - Find a child who has the parent @mas->node. * @mas: the maple state with the parent. * @child: the maple state to store the child. */ static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) __must_hold(mas->tree->ma_lock) { enum maple_type mt; unsigned char offset; unsigned char end; unsigned long *pivots; struct maple_enode *entry; struct maple_node *node; void __rcu **slots; mt = mte_node_type(mas->node); node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); end = ma_data_end(node, mt, pivots, mas->max); for (offset = mas->offset; offset <= end; offset++) { entry = mas_slot_locked(mas, slots, offset); if (mte_parent(entry) == node) { *child = *mas; mas->offset = offset + 1; child->offset = offset; mas_descend(child); child->offset = 0; return true; } } return false; } /* * mab_shift_right() - Shift the data in mab right. Note, does not clean out the * old data or set b_node->b_end. * @b_node: the maple_big_node * @shift: the shift count */ static inline void mab_shift_right(struct maple_big_node *b_node, unsigned char shift) { unsigned long size = b_node->b_end * sizeof(unsigned long); memmove(b_node->pivot + shift, b_node->pivot, size); memmove(b_node->slot + shift, b_node->slot, size); if (b_node->type == maple_arange_64) memmove(b_node->gap + shift, b_node->gap, size); } /* * mab_middle_node() - Check if a middle node is needed (unlikely) * @b_node: the maple_big_node that contains the data. * @size: the amount of data in the b_node * @split: the potential split location * @slot_count: the size that can be stored in a single node being considered. * * Return: true if a middle node is required. */ static inline bool mab_middle_node(struct maple_big_node *b_node, int split, unsigned char slot_count) { unsigned char size = b_node->b_end; if (size >= 2 * slot_count) return true; if (!b_node->slot[split] && (size >= 2 * slot_count - 1)) return true; return false; } /* * mab_no_null_split() - ensure the split doesn't fall on a NULL * @b_node: the maple_big_node with the data * @split: the suggested split location * @slot_count: the number of slots in the node being considered. * * Return: the split location. */ static inline int mab_no_null_split(struct maple_big_node *b_node, unsigned char split, unsigned char slot_count) { if (!b_node->slot[split]) { /* * If the split is less than the max slot && the right side will * still be sufficient, then increment the split on NULL. */ if ((split < slot_count - 1) && (b_node->b_end - split) > (mt_min_slots[b_node->type])) split++; else split--; } return split; } /* * mab_calc_split() - Calculate the split location and if there needs to be two * splits. * @bn: The maple_big_node with the data * @mid_split: The second split, if required. 0 otherwise. * * Return: The first split location. The middle split is set in @mid_split. */ static inline int mab_calc_split(struct ma_state *mas, struct maple_big_node *bn, unsigned char *mid_split, unsigned long min) { unsigned char b_end = bn->b_end; int split = b_end / 2; /* Assume equal split. */ unsigned char slot_min, slot_count = mt_slots[bn->type]; /* * To support gap tracking, all NULL entries are kept together and a node cannot * end on a NULL entry, with the exception of the left-most leaf. The * limitation means that the split of a node must be checked for this condition * and be able to put more data in one direction or the other. */ if (unlikely((mas->mas_flags & MA_STATE_BULK))) { *mid_split = 0; split = b_end - mt_min_slots[bn->type]; if (!ma_is_leaf(bn->type)) return split; mas->mas_flags |= MA_STATE_REBALANCE; if (!bn->slot[split]) split--; return split; } /* * Although extremely rare, it is possible to enter what is known as the 3-way * split scenario. The 3-way split comes about by means of a store of a range * that overwrites the end and beginning of two full nodes. The result is a set * of entries that cannot be stored in 2 nodes. Sometimes, these two nodes can * also be located in different parent nodes which are also full. This can * carry upwards all the way to the root in the worst case. */ if (unlikely(mab_middle_node(bn, split, slot_count))) { split = b_end / 3; *mid_split = split * 2; } else { slot_min = mt_min_slots[bn->type]; *mid_split = 0; /* * Avoid having a range less than the slot count unless it * causes one node to be deficient. * NOTE: mt_min_slots is 1 based, b_end and split are zero. */ while ((split < slot_count - 1) && ((bn->pivot[split] - min) < slot_count - 1) && (b_end - split > slot_min)) split++; } /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); if (unlikely(*mid_split)) *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } /* * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node * and set @b_node->b_end to the next free slot. * @mas: The maple state * @mas_start: The starting slot to copy * @mas_end: The end slot to copy (inclusively) * @b_node: The maple_big_node to place the data * @mab_start: The starting location in maple_big_node to store the data. */ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, unsigned char mas_end, struct maple_big_node *b_node, unsigned char mab_start) { enum maple_type mt; struct maple_node *node; void __rcu **slots; unsigned long *pivots, *gaps; int i = mas_start, j = mab_start; unsigned char piv_end; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); if (!i) { b_node->pivot[j] = pivots[i++]; if (unlikely(i > mas_end)) goto complete; j++; } piv_end = min(mas_end, mt_pivots[mt]); for (; i < piv_end; i++, j++) { b_node->pivot[j] = pivots[i]; if (unlikely(!b_node->pivot[j])) break; if (unlikely(mas->max == b_node->pivot[j])) goto complete; } if (likely(i <= mas_end)) b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); complete: b_node->b_end = ++j; j -= mab_start; slots = ma_slots(node, mt); memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j); if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) { gaps = ma_gaps(node, mt); memcpy(b_node->gap + mab_start, gaps + mas_start, sizeof(unsigned long) * j); } } /* * mas_leaf_set_meta() - Set the metadata of a leaf if possible. * @node: The maple node * @mt: The maple type * @end: The node end */ static inline void mas_leaf_set_meta(struct maple_node *node, enum maple_type mt, unsigned char end) { if (end < mt_slots[mt] - 1) ma_set_meta(node, mt, 0, end); } /* * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node. * @b_node: the maple_big_node that has the data * @mab_start: the start location in @b_node. * @mab_end: The end location in @b_node (inclusively) * @mas: The maple state with the maple encoded node. */ static inline void mab_mas_cp(struct maple_big_node *b_node, unsigned char mab_start, unsigned char mab_end, struct ma_state *mas, bool new_max) { int i, j = 0; enum maple_type mt = mte_node_type(mas->node); struct maple_node *node = mte_to_node(mas->node); void __rcu **slots = ma_slots(node, mt); unsigned long *pivots = ma_pivots(node, mt); unsigned long *gaps = NULL; unsigned char end; if (mab_end - mab_start > mt_pivots[mt]) mab_end--; if (!pivots[mt_pivots[mt] - 1]) slots[mt_pivots[mt]] = NULL; i = mab_start; do { pivots[j++] = b_node->pivot[i++]; } while (i <= mab_end && likely(b_node->pivot[i])); memcpy(slots, b_node->slot + mab_start, sizeof(void *) * (i - mab_start)); if (new_max) mas->max = b_node->pivot[i - 1]; end = j - 1; if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { unsigned long max_gap = 0; unsigned char offset = 0; gaps = ma_gaps(node, mt); do { gaps[--j] = b_node->gap[--i]; if (gaps[j] > max_gap) { offset = j; max_gap = gaps[j]; } } while (j); ma_set_meta(node, mt, offset, end); } else { mas_leaf_set_meta(node, mt, end); } } /* * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert. * @mas: The maple state * @end: The maple node end * @mt: The maple node type */ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, enum maple_type mt) { if (!(mas->mas_flags & MA_STATE_BULK)) return; if (mte_is_root(mas->node)) return; if (end > mt_min_slots[mt]) { mas->mas_flags &= ~MA_STATE_REBALANCE; return; } } /* * mas_store_b_node() - Store an @entry into the b_node while also copying the * data from a maple encoded node. * @wr_mas: the maple write state * @b_node: the maple_big_node to fill with data * @offset_end: the offset to end copying * * Return: The actual end of the data stored in @b_node */ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; unsigned char b_end; /* Possible underflow of piv will wrap back to 0 before use. */ unsigned long piv; struct ma_state *mas = wr_mas->mas; b_node->type = wr_mas->type; b_end = 0; slot = mas->offset; if (slot) { /* Copy start data up to insert. */ mas_mab_cp(mas, 0, slot - 1, b_node, 0); b_end = b_node->b_end; piv = b_node->pivot[b_end - 1]; } else piv = mas->min - 1; if (piv + 1 < mas->index) { /* Handle range starting after old range */ b_node->slot[b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = mas->index - 1 - piv; b_node->pivot[b_end++] = mas->index - 1; } /* Store the new entry. */ mas->offset = b_end; b_node->slot[b_end] = wr_mas->entry; b_node->pivot[b_end] = mas->last; /* Appended. */ if (mas->last >= mas->max) goto b_end; /* Handle new range ending before old range ends */ piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { if (piv == ULONG_MAX) mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); if (offset_end != slot) wr_mas->content = mas_slot_locked(mas, wr_mas->slots, offset_end); b_node->slot[++b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = piv - mas->last + 1; b_node->pivot[b_end] = piv; } slot = offset_end + 1; if (slot > mas->end) goto b_end; /* Copy end data to the end of the node. */ mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end); b_node->b_end--; return; b_end: b_node->b_end = b_end; } /* * mas_prev_sibling() - Find the previous node with the same parent. * @mas: the maple state * * Return: True if there is a previous sibling, false otherwise. */ static inline bool mas_prev_sibling(struct ma_state *mas) { unsigned int p_slot = mte_parent_slot(mas->node); if (mte_is_root(mas->node)) return false; if (!p_slot) return false; mas_ascend(mas); mas->offset = p_slot - 1; mas_descend(mas); return true; } /* * mas_next_sibling() - Find the next node with the same parent. * @mas: the maple state * * Return: true if there is a next sibling, false otherwise. */ static inline bool mas_next_sibling(struct ma_state *mas) { MA_STATE(parent, mas->tree, mas->index, mas->last); if (mte_is_root(mas->node)) return false; parent = *mas; mas_ascend(&parent); parent.offset = mte_parent_slot(mas->node) + 1; if (parent.offset > mas_data_end(&parent)) return false; *mas = parent; mas_descend(mas); return true; } /* * mte_node_or_none() - Set the enode and state. * @enode: The encoded maple node. * * Set the node to the enode and the status. */ static inline void mas_node_or_none(struct ma_state *mas, struct maple_enode *enode) { if (enode) { mas->node = enode; mas->status = ma_active; } else { mas->node = NULL; mas->status = ma_none; } } /* * mas_wr_node_walk() - Find the correct offset for the index in the @mas. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. */ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char count, offset; if (unlikely(ma_is_dense(wr_mas->type))) { wr_mas->r_max = wr_mas->r_min = mas->index; mas->offset = mas->index = mas->min; return; } wr_mas->node = mas_mn(wr_mas->mas); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); count = mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); offset = mas->offset; while (offset < count && mas->index > wr_mas->pivots[offset]) offset++; wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max; wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); wr_mas->offset_end = mas->offset = offset; } /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state * @old_r: The encoded maple node to the right (next node). */ static inline void mast_rebalance_next(struct maple_subtree_state *mast) { unsigned char b_end = mast->bn->b_end; mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node), mast->bn, b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_rebalance_prev() - Rebalance against the previous node * @mast: The maple subtree state * @old_l: The encoded maple node to the left (previous node) */ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) { unsigned char end = mas_data_end(mast->orig_l) + 1; unsigned char b_end = mast->bn->b_end; mab_shift_right(mast->bn, end); mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0); mast->l->min = mast->orig_l->min; mast->orig_l->index = mast->orig_l->min; mast->bn->b_end = end + b_end; mast->l->offset += end; } /* * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring * the node to the right. Checking the nodes to the right then the left at each * level upwards until root is reached. * Data is copied into the @mast->bn. * @mast: The maple_subtree_state. */ static inline bool mast_spanning_rebalance(struct maple_subtree_state *mast) { struct ma_state r_tmp = *mast->orig_r; struct ma_state l_tmp = *mast->orig_l; unsigned char depth = 0; do { mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); depth++; if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { mast->orig_r->offset++; do { mas_descend(mast->orig_r); mast->orig_r->offset = 0; } while (--depth); mast_rebalance_next(mast); *mast->orig_l = l_tmp; return true; } else if (mast->orig_l->offset != 0) { mast->orig_l->offset--; do { mas_descend(mast->orig_l); mast->orig_l->offset = mas_data_end(mast->orig_l); } while (--depth); mast_rebalance_prev(mast); *mast->orig_r = r_tmp; return true; } } while (!mte_is_root(mast->orig_r->node)); *mast->orig_r = r_tmp; *mast->orig_l = l_tmp; return false; } /* * mast_ascend() - Ascend the original left and right maple states. * @mast: the maple subtree state. * * Ascend the original left and right sides. Set the offsets to point to the * data already in the new tree (@mast->l and @mast->r). */ static inline void mast_ascend(struct maple_subtree_state *mast) { MA_WR_STATE(wr_mas, mast->orig_r, NULL); mas_ascend(mast->orig_l); mas_ascend(mast->orig_r); mast->orig_r->offset = 0; mast->orig_r->index = mast->r->max; /* last should be larger than or equal to index */ if (mast->orig_r->last < mast->orig_r->index) mast->orig_r->last = mast->orig_r->index; wr_mas.type = mte_node_type(mast->orig_r->node); mas_wr_node_walk(&wr_mas); /* Set up the left side of things */ mast->orig_l->offset = 0; mast->orig_l->index = mast->l->min; wr_mas.mas = mast->orig_l; wr_mas.type = mte_node_type(mast->orig_l->node); mas_wr_node_walk(&wr_mas); mast->bn->type = wr_mas.type; } /* * mas_new_ma_node() - Create and return a new maple node. Helper function. * @mas: the maple state with the allocations. * @b_node: the maple_big_node with the type encoding. * * Use the node type from the maple_big_node to allocate a new node from the * ma_state. This function exists mainly for code readability. * * Return: A new maple encoded node */ static inline struct maple_enode *mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node) { return mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type); } /* * mas_mab_to_node() - Set up right and middle nodes * * @mas: the maple state that contains the allocations. * @b_node: the node which contains the data. * @left: The pointer which will have the left node * @right: The pointer which may have the right node * @middle: the pointer which may have the middle node (rare) * @mid_split: the split location for the middle node * * Return: the split of left. */ static inline unsigned char mas_mab_to_node(struct ma_state *mas, struct maple_big_node *b_node, struct maple_enode **left, struct maple_enode **right, struct maple_enode **middle, unsigned char *mid_split, unsigned long min) { unsigned char split = 0; unsigned char slot_count = mt_slots[b_node->type]; *left = mas_new_ma_node(mas, b_node); *right = NULL; *middle = NULL; *mid_split = 0; if (b_node->b_end < slot_count) { split = b_node->b_end; } else { split = mab_calc_split(mas, b_node, mid_split, min); *right = mas_new_ma_node(mas, b_node); } if (*mid_split) *middle = mas_new_ma_node(mas, b_node); return split; } /* * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end * pointer. * @b_node - the big node to add the entry * @mas - the maple state to get the pivot (mas->max) * @entry - the entry to add, if NULL nothing happens. */ static inline void mab_set_b_end(struct maple_big_node *b_node, struct ma_state *mas, void *entry) { if (!entry) return; b_node->slot[b_node->b_end] = entry; if (mt_is_alloc(mas->tree)) b_node->gap[b_node->b_end] = mas_max_gap(mas); b_node->pivot[b_node->b_end++] = mas->max; } /* * mas_set_split_parent() - combine_then_separate helper function. Sets the parent * of @mas->node to either @left or @right, depending on @slot and @split * * @mas - the maple state with the node that needs a parent * @left - possible parent 1 * @right - possible parent 2 * @slot - the slot the mas->node was placed * @split - the split location between @left and @right */ static inline void mas_set_split_parent(struct ma_state *mas, struct maple_enode *left, struct maple_enode *right, unsigned char *slot, unsigned char split) { if (mas_is_none(mas)) return; if ((*slot) <= split) mas_set_parent(mas, mas->node, left, *slot); else if (right) mas_set_parent(mas, mas->node, right, (*slot) - split - 1); (*slot)++; } /* * mte_mid_split_check() - Check if the next node passes the mid-split * @**l: Pointer to left encoded maple node. * @**m: Pointer to middle encoded maple node. * @**r: Pointer to right encoded maple node. * @slot: The offset * @*split: The split location. * @mid_split: The middle split. */ static inline void mte_mid_split_check(struct maple_enode **l, struct maple_enode **r, struct maple_enode *right, unsigned char slot, unsigned char *split, unsigned char mid_split) { if (*r == right) return; if (slot < mid_split) return; *l = *r; *r = right; *split = mid_split; } /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. * @mast - the maple subtree state * @left - the left node * @right - the right node * @split - the split location. */ static inline void mast_set_split_parents(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { unsigned char slot; struct maple_enode *l = left; struct maple_enode *r = right; if (mas_is_none(mast->l)) return; if (middle) r = middle; slot = mast->l->offset; mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->l, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->m, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->r, l, r, &slot, split); } /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes * @enode: The encoded maple node * @in_rcu: If the tree is in rcu mode * * The node will either be RCU freed or pushed back on the maple state. */ static inline void mas_topiary_node(struct ma_state *mas, struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; struct maple_enode *enode; if (mas_is_none(tmp_mas)) return; enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); if (in_rcu) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_topiary_replace() - Replace the data with new data, then repair the * parent links within the new tree. Iterate over the dead sub-tree and collect * the dead subtrees and topiary the nodes that are no longer of use. * * The new tree will have up to three children with the correct parent. Keep * track of the new entries as they need to be followed to find the next level * of new entries. * * The old tree will have up to three children with the old parent. Keep track * of the old entries as they may have more nodes below replaced. Nodes within * [index, last] are dead subtrees, others need to be freed and followed. * * @mas: The maple state pointing at the new data * @old_enode: The maple encoded node being replaced * */ static inline void mas_topiary_replace(struct ma_state *mas, struct maple_enode *old_enode) { struct ma_state tmp[3], tmp_next[3]; MA_TOPIARY(subtrees, mas->tree); bool in_rcu; int i, n; /* Place data in tree & then mark node as old */ mas_put_in_tree(mas, old_enode); /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; tmp[1].status = ma_none; tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; n++; } mas_adopt_children(&tmp[i], tmp[i].node); } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; } /* Collect the old nodes that need to be discarded */ if (mte_is_leaf(old_enode)) return mas_free(mas, old_enode); tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; tmp[1].status = ma_none; tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); tmp_next[n].status = ma_none; } else { n++; } } } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } /* * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state * @old: The old maple encoded node that is being replaced. * * Updates gap as necessary. */ static inline void mas_wmb_replace(struct ma_state *mas, struct maple_enode *old_enode) { /* Insert the new data in the tree */ mas_topiary_replace(mas, old_enode); if (mte_is_leaf(mas->node)) return; mas_update_gap(mas); } /* * mast_cp_to_nodes() - Copy data out to nodes. * @mast: The maple subtree state * @left: The left encoded maple node * @middle: The middle encoded maple node * @right: The right encoded maple node * @split: The location to split between left and (middle ? middle : right) * @mid_split: The location to split between middle and right. */ static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { bool new_lmax = true; mas_node_or_none(mast->l, left); mas_node_or_none(mast->m, middle); mas_node_or_none(mast->r, right); mast->l->min = mast->orig_l->min; if (split == mast->bn->b_end) { mast->l->max = mast->orig_r->max; new_lmax = false; } mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax); if (middle) { mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true); mast->m->min = mast->bn->pivot[split] + 1; split = mid_split; } mast->r->max = mast->orig_r->max; if (right) { mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false); mast->r->min = mast->bn->pivot[split] + 1; } } /* * mast_combine_cp_left - Copy in the original left side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_left(struct maple_subtree_state *mast) { unsigned char l_slot = mast->orig_l->offset; if (!l_slot) return; mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0); } /* * mast_combine_cp_right: Copy in the original right side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_right(struct maple_subtree_state *mast) { if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max) return; mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1, mt_slot_count(mast->orig_r->node), mast->bn, mast->bn->b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_sufficient: Check if the maple subtree state has enough data in the big * node to create at least one sufficient node * @mast: the maple subtree state */ static inline bool mast_sufficient(struct maple_subtree_state *mast) { if (mast->bn->b_end > mt_min_slot_count(mast->orig_l->node)) return true; return false; } /* * mast_overflow: Check if there is too much data in the subtree state for a * single node. * @mast: The maple subtree state */ static inline bool mast_overflow(struct maple_subtree_state *mast) { if (mast->bn->b_end >= mt_slot_count(mast->orig_l->node)) return true; return false; } static inline void *mtree_range_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next, *last; enum maple_type type; void __rcu **slots; unsigned char end; unsigned long max, min; unsigned long prev_max, prev_min; next = mas->node; min = mas->min; max = mas->max; do { last = next; node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = ma_data_end(node, type, pivots, max); prev_min = min; prev_max = max; if (pivots[0] >= mas->index) { offset = 0; max = pivots[0]; goto next; } offset = 1; while (offset < end) { if (pivots[offset] >= mas->index) { max = pivots[offset]; break; } offset++; } min = pivots[offset - 1] + 1; next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; mas->min = prev_min; mas->max = prev_max; mas->node = last; return (void *)next; dead_node: mas_reset(mas); return NULL; } /* * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers. * @mas: The starting maple state * @mast: The maple_subtree_state, keeps track of 4 maple states. * @count: The estimated count of iterations needed. * * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root * is hit. First @b_node is split into two entries which are inserted into the * next iteration of the loop. @b_node is returned populated with the final * iteration. @mas is used to obtain allocations. orig_l_mas keeps track of the * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last * to account of what has been copied into the new sub-tree. The update of * orig_l_mas->last is used in mas_consume to find the slots that will need to * be either freed or destroyed. orig_l_mas->depth keeps track of the height of * the new sub-tree in case the sub-tree becomes the full tree. * * Return: the number of elements in b_node during the last loop. */ static int mas_spanning_rebalance(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { unsigned char split, mid_split; unsigned char slot = 0; struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); /* * The tree needs to be rebalanced and leaves need to be kept at the same level. * Rebalancing is done by use of the ``struct maple_topiary``. */ mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); l_mas.depth = 0; /* * Each level of the tree is examined and balanced, pushing data to the left or * right, or rebalancing against left or right nodes is employed to avoid * rippling up the tree to limit the amount of churn. Once a new sub-section of * the tree is created, there may be a mix of new and old nodes. The old nodes * will have the incorrect parent pointers and currently be in two trees: the * original tree and the partially new tree. To remedy the parent pointers in * the old tree, the new data is swapped into the active tree and a walk down * the tree is performed and the parent pointers are updated. * See mas_topiary_replace() for more information. */ while (count--) { mast->bn->b_end--; mast->bn->type = mte_node_type(mast->orig_l->node); split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, &mid_split, mast->orig_l->min); mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); /* * Copy data from next level in the tree to mast->bn from next * iteration */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); l_mas.depth++; /* Root already stored in l->node. */ if (mas_is_root_limits(mast->l)) goto new_root; mast_ascend(mast); mast_combine_cp_left(mast); l_mas.offset = mast->bn->b_end; mab_set_b_end(mast->bn, &l_mas, left); mab_set_b_end(mast->bn, &m_mas, middle); mab_set_b_end(mast->bn, &r_mas, right); /* Copy anything necessary out of the right node. */ mast_combine_cp_right(mast); mast->orig_l->last = mast->orig_l->max; if (mast_sufficient(mast)) continue; if (mast_overflow(mast)) continue; /* May be a new root stored in mast->bn */ if (mas_is_root_limits(mast->orig_l)) break; mast_spanning_rebalance(mast); /* rebalancing from other nodes may require another loop. */ if (!count) count++; } l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); l_mas.depth++; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); mas_set_parent(mas, left, l_mas.node, slot); if (middle) mas_set_parent(mas, middle, l_mas.node, ++slot); if (right) mas_set_parent(mas, right, l_mas.node, ++slot); if (mas_is_root_limits(mast->l)) { new_root: mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); while (!mte_is_root(mast->orig_l->node)) mast_ascend(mast); } else { mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; } old_enode = mast->orig_l->node; mas->depth = l_mas.depth; mas->node = l_mas.node; mas->min = l_mas.min; mas->max = l_mas.max; mas->offset = l_mas.offset; mas_wmb_replace(mas, old_enode); mtree_range_walk(mas); return mast->bn->b_end; } /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state * @b_node: The big maple node. * * Rebalance two nodes into a single node or two new nodes that are sufficient. * Continue upwards until tree is sufficient. * * Return: the number of elements in b_node during the last loop. */ static inline int mas_rebalance(struct ma_state *mas, struct maple_big_node *b_node) { char empty_count = mas_mt_height(mas); struct maple_subtree_state mast; unsigned char shift, b_end = ++b_node->b_end; MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); /* * Rebalancing occurs if a node is insufficient. Data is rebalanced * against the node to the right if it exists, otherwise the node to the * left of this node is rebalanced against this node. If rebalancing * causes just one node to be produced instead of two, then the parent * is also examined and rebalanced if it is insufficient. Every level * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ mas_node_count(mas, empty_count * 2 - 1); if (mas_is_err(mas)) return 0; mast.orig_l = &l_mas; mast.orig_r = &r_mas; mast.bn = b_node; mast.bn->type = mte_node_type(mas->node); l_mas = r_mas = *mas; if (mas_next_sibling(&r_mas)) { mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end); r_mas.last = r_mas.index = r_mas.max; } else { mas_prev_sibling(&l_mas); shift = mas_data_end(&l_mas) + 1; mab_shift_right(b_node, shift); mas->offset += shift; mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0); b_node->b_end = shift + b_end; l_mas.index = l_mas.last = l_mas.min; } return mas_spanning_rebalance(mas, &mast, empty_count); } /* * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple * state. * @mas: The maple state * @end: The end of the left-most node. * * During a mass-insert event (such as forking), it may be necessary to * rebalance the left-most node when it is not sufficient. */ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end) { enum maple_type mt = mte_node_type(mas->node); struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; struct maple_enode *eparent, *old_eparent; unsigned char offset, tmp, split = mt_slots[mt] / 2; void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; bool in_rcu = mt_in_rcu(mas->tree); MA_STATE(l_mas, mas->tree, mas->index, mas->last); l_mas = *mas; mas_prev_sibling(&l_mas); /* set up node. */ if (in_rcu) { /* Allocate for both left and right as well as parent. */ mas_node_count(mas, 3); if (mas_is_err(mas)) return; newnode = mas_pop_node(mas); } else { newnode = &reuse; } node = mas_mn(mas); newnode->parent = node->parent; slots = ma_slots(newnode, mt); pivs = ma_pivots(newnode, mt); left = mas_mn(&l_mas); l_slots = ma_slots(left, mt); l_pivs = ma_pivots(left, mt); if (!l_slots[split]) split++; tmp = mas_data_end(&l_mas) - split; memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp); memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp); pivs[tmp] = l_mas.max; memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end); memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end); l_mas.max = l_pivs[split]; mas->min = l_mas.max + 1; old_eparent = mt_mk_node(mte_parent(l_mas.node), mas_parent_type(&l_mas, l_mas.node)); tmp += end; if (!in_rcu) { unsigned char max_p = mt_pivots[mt]; unsigned char max_s = mt_slots[mt]; if (tmp < max_p) memset(pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); if (tmp < mt_slots[mt]) memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); memcpy(node, newnode, sizeof(struct maple_node)); ma_set_meta(node, mt, 0, tmp - 1); mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), l_pivs[split]); /* Remove data from l_pivs. */ tmp = split + 1; memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); ma_set_meta(left, mt, 0, split); eparent = old_eparent; goto done; } /* RCU requires replacing both l_mas, mas, and parent. */ mas->node = mt_mk_node(newnode, mt); ma_set_meta(newnode, mt, 0, tmp); new_left = mas_pop_node(mas); new_left->parent = left->parent; mt = mte_node_type(l_mas.node); slots = ma_slots(new_left, mt); pivs = ma_pivots(new_left, mt); memcpy(slots, l_slots, sizeof(void *) * split); memcpy(pivs, l_pivs, sizeof(unsigned long) * split); ma_set_meta(new_left, mt, 0, split); l_mas.node = mt_mk_node(new_left, mt); /* replace parent. */ offset = mte_parent_slot(mas->node); mt = mas_parent_type(&l_mas, l_mas.node); parent = mas_pop_node(mas); slots = ma_slots(parent, mt); pivs = ma_pivots(parent, mt); memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); rcu_assign_pointer(slots[offset], mas->node); rcu_assign_pointer(slots[offset - 1], l_mas.node); pivs[offset - 1] = l_mas.max; eparent = mt_mk_node(parent, mt); done: gap = mas_leaf_max_gap(mas); mte_set_gap(eparent, mte_parent_slot(mas->node), gap); gap = mas_leaf_max_gap(&l_mas); mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); mas_ascend(mas); if (in_rcu) { mas_replace_node(mas, old_eparent); mas_adopt_children(mas, mas->node); } mas_update_gap(mas); } /* * mas_split_final_node() - Split the final node in a subtree operation. * @mast: the maple subtree state * @mas: The maple state * @height: The height of the tree in case it's a new root. */ static inline void mas_split_final_node(struct maple_subtree_state *mast, struct ma_state *mas, int height) { struct maple_enode *ancestor; if (mte_is_root(mas->node)) { if (mt_is_alloc(mas->tree)) mast->bn->type = maple_arange_64; else mast->bn->type = maple_range_64; mas->depth = height; } /* * Only a single node is used here, could be root. * The Big_node data should just fit in a single node. */ ancestor = mas_new_ma_node(mas, mast->bn); mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset); mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset); mte_to_node(ancestor)->parent = mas_mn(mas)->parent; mast->l->node = ancestor; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); mas->offset = mast->bn->b_end - 1; } /* * mast_fill_bnode() - Copy data into the big node in the subtree state * @mast: The maple subtree state * @mas: the maple state * @skip: The number of entries to skip for new nodes insertion. */ static inline void mast_fill_bnode(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char skip) { bool cp = true; unsigned char split; memset(mast->bn->gap, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->gap)); memset(mast->bn->slot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->slot)); memset(mast->bn->pivot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->pivot)); mast->bn->b_end = 0; if (mte_is_root(mas->node)) { cp = false; } else { mas_ascend(mas); mas->offset = mte_parent_slot(mas->node); } if (cp && mast->l->offset) mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0); split = mast->bn->b_end; mab_set_b_end(mast->bn, mast->l, mast->l->node); mast->r->offset = mast->bn->b_end; mab_set_b_end(mast->bn, mast->r, mast->r->node); if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max) cp = false; if (cp) mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1, mast->bn, mast->bn->b_end); mast->bn->b_end--; mast->bn->type = mte_node_type(mas->node); } /* * mast_split_data() - Split the data in the subtree state big node into regular * nodes. * @mast: The maple subtree state * @mas: The maple state * @split: The location to split the big node */ static inline void mast_split_data(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char split) { unsigned char p_slot; mab_mas_cp(mast->bn, 0, split, mast->l, true); mte_set_pivot(mast->r->node, 0, mast->r->max); mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false); mast->l->offset = mte_parent_slot(mas->node); mast->l->max = mast->bn->pivot[split]; mast->r->min = mast->l->max + 1; if (mte_is_leaf(mas->node)) return; p_slot = mast->orig_l->offset; mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node, &p_slot, split); mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node, &p_slot, split); } /* * mas_push_data() - Instead of splitting a node, it is beneficial to push the * data to the right or left node if there is room. * @mas: The maple state * @height: The current height of the maple state * @mast: The maple subtree state * @left: Push left or not. * * Keeping the height of the tree low means faster lookups. * * Return: True if pushed, false otherwise. */ static inline bool mas_push_data(struct ma_state *mas, int height, struct maple_subtree_state *mast, bool left) { unsigned char slot_total = mast->bn->b_end; unsigned char end, space, split; MA_STATE(tmp_mas, mas->tree, mas->index, mas->last); tmp_mas = *mas; tmp_mas.depth = mast->l->depth; if (left && !mas_prev_sibling(&tmp_mas)) return false; else if (!left && !mas_next_sibling(&tmp_mas)) return false; end = mas_data_end(&tmp_mas); slot_total += end; space = 2 * mt_slot_count(mas->node) - 2; /* -2 instead of -1 to ensure there isn't a triple split */ if (ma_is_leaf(mast->bn->type)) space--; if (mas->max == ULONG_MAX) space--; if (slot_total >= space) return false; /* Get the data; Fill mast->bn */ mast->bn->b_end++; if (left) { mab_shift_right(mast->bn, end + 1); mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0); mast->bn->b_end = slot_total + 1; } else { mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end); } /* Configure mast for splitting of mast->bn */ split = mt_slots[mast->bn->type] - 2; if (left) { /* Switch mas to prev node */ *mas = tmp_mas; /* Start using mast->l for the left side. */ tmp_mas.node = mast->l->node; *mast->l = tmp_mas; } else { tmp_mas.node = mast->r->node; *mast->r = tmp_mas; split = slot_total - split; } split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]); /* Update parent slot for split calculation. */ if (left) mast->orig_l->offset += end + 1; mast_split_data(mast, mas, split); mast_fill_bnode(mast, mas, 2); mas_split_final_node(mast, mas, height + 1); return true; } /* * mas_split() - Split data that is too big for one node into two. * @mas: The maple state * @b_node: The maple big node * Return: 1 on success, 0 on failure. */ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; struct maple_enode *old; /* * Splitting is handled differently from any other B-tree; the Maple * Tree splits upwards. Splitting up means that the split operation * occurs when the walk of the tree hits the leaves and not on the way * down. The reason for splitting up is that it is impossible to know * how much space will be needed until the leaf is (or leaves are) * reached. Since overwriting data is allowed and a range could * overwrite more than one range or result in changing one entry into 3 * entries, it is impossible to know if a split is required until the * data is examined. * * Splitting is a balancing act between keeping allocations to a minimum * and avoiding a 'jitter' event where a tree is expanded to make room * for an entry followed by a contraction when the entry is removed. To * accomplish the balance, there are empty slots remaining in both left * and right nodes after a split. */ MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); mas->depth = mas_mt_height(mas); /* Allocation failures will happen early. */ mas_node_count(mas, 1 + mas->depth * 2); if (mas_is_err(mas)) return 0; mast.l = &l_mas; mast.r = &r_mas; mast.orig_l = &prev_l_mas; mast.orig_r = &prev_r_mas; mast.bn = b_node; while (height++ <= mas->depth) { if (mt_slots[b_node->type] > b_node->b_end) { mas_split_final_node(&mast, mas, height); break; } l_mas = r_mas = *mas; l_mas.node = mas_new_ma_node(mas, b_node); r_mas.node = mas_new_ma_node(mas, b_node); /* * Another way that 'jitter' is avoided is to terminate a split up early if the * left or right node has space to spare. This is referred to as "pushing left" * or "pushing right" and is similar to the B* tree, except the nodes left or * right can rarely be reused due to RCU, but the ripple upwards is halted which * is a significant savings. */ /* Try to push left. */ if (mas_push_data(mas, height, &mast, true)) break; /* Try to push right. */ if (mas_push_data(mas, height, &mast, false)) break; split = mab_calc_split(mas, b_node, &mid_split, prev_l_mas.min); mast_split_data(&mast, mas, split); /* * Usually correct, mab_mas_cp in the above call overwrites * r->max. */ mast.r->max = mas->max; mast_fill_bnode(&mast, mas, 1); prev_l_mas = *mast.l; prev_r_mas = *mast.r; } /* Set the original node as dead */ old = mas->node; mas->node = l_mas.node; mas_wmb_replace(mas, old); mtree_range_walk(mas); return 1; } /* * mas_reuse_node() - Reuse the node to store the data. * @wr_mas: The maple write state * @bn: The maple big node * @end: The end of the data. * * Will always return false in RCU mode. * * Return: True if node was reused, false otherwise. */ static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, struct maple_big_node *bn, unsigned char end) { /* Need to be rcu safe. */ if (mt_in_rcu(wr_mas->mas->tree)) return false; if (end > bn->b_end) { int clear = mt_slots[wr_mas->type] - bn->b_end; memset(wr_mas->slots + bn->b_end, 0, sizeof(void *) * clear--); memset(wr_mas->pivots + bn->b_end, 0, sizeof(void *) * clear); } mab_mas_cp(bn, 0, bn->b_end, wr_mas->mas, false); return true; } /* * mas_commit_b_node() - Commit the big node into the tree. * @wr_mas: The maple write state * @b_node: The maple big node * @end: The end of the data. */ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char end) { struct maple_node *node; struct maple_enode *old_enode; unsigned char b_end = b_node->b_end; enum maple_type b_type = b_node->type; old_enode = wr_mas->mas->node; if ((b_end < mt_min_slots[b_type]) && (!mte_is_root(old_enode)) && (mas_mt_height(wr_mas->mas) > 1)) return mas_rebalance(wr_mas->mas, b_node); if (b_end >= mt_slots[b_type]) return mas_split(wr_mas->mas, b_node); if (mas_reuse_node(wr_mas, b_node, end)) goto reuse_node; mas_node_count(wr_mas->mas, 1); if (mas_is_err(wr_mas->mas)) return 0; node = mas_pop_node(wr_mas->mas); node->parent = mas_mn(wr_mas->mas)->parent; wr_mas->mas->node = mt_mk_node(node, b_type); mab_mas_cp(b_node, 0, b_end, wr_mas->mas, false); mas_replace_node(wr_mas->mas, old_enode); reuse_node: mas_update_gap(wr_mas->mas); wr_mas->mas->end = b_end; return 1; } /* * mas_root_expand() - Expand a root to a node * @mas: The maple state * @entry: The entry to store into the tree */ static inline int mas_root_expand(struct ma_state *mas, void *entry) { void *contents = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; int slot = 0; mas_node_count(mas, 1); if (unlikely(mas_is_err(mas))) return 0; node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; if (mas->index) { if (contents) { rcu_assign_pointer(slots[slot], contents); if (likely(mas->index > 1)) slot++; } pivots[slot++] = mas->index - 1; } rcu_assign_pointer(slots[slot], entry); mas->offset = slot; pivots[slot] = mas->last; if (mas->last != ULONG_MAX) pivots[++slot] = ULONG_MAX; mas->depth = 1; mas_set_height(mas); ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); return slot; } static inline void mas_store_root(struct ma_state *mas, void *entry) { if (likely((mas->last != 0) || (mas->index != 0))) mas_root_expand(mas, entry); else if (((unsigned long) (entry) & 3) == 2) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; } } /* * mas_is_span_wr() - Check if the write needs to be treated as a write that * spans the node. * @mas: The maple state * @piv: The pivot value being written * @type: The maple node type * @entry: The data to write * * Spanning writes are writes that start in one node and end in another OR if * the write of a %NULL will cause the node to end with a %NULL. * * Return: True if this is a spanning write, false otherwise. */ static bool mas_is_span_wr(struct ma_wr_state *wr_mas) { unsigned long max = wr_mas->r_max; unsigned long last = wr_mas->mas->last; enum maple_type type = wr_mas->type; void *entry = wr_mas->entry; /* Contained in this pivot, fast path */ if (last < max) return false; if (ma_is_leaf(type)) { max = wr_mas->mas->max; if (last < max) return false; } if (last == max) { /* * The last entry of leaf node cannot be NULL unless it is the * rightmost node (writing ULONG_MAX), otherwise it spans slots. */ if (entry || last == ULONG_MAX) return false; } trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry); return true; } static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas) { wr_mas->type = mte_node_type(wr_mas->mas->node); mas_wr_node_walk(wr_mas); wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); } static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas) { wr_mas->mas->max = wr_mas->r_max; wr_mas->mas->min = wr_mas->r_min; wr_mas->mas->node = wr_mas->content; wr_mas->mas->offset = 0; wr_mas->mas->depth++; } /* * mas_wr_walk() - Walk the tree for a write. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. * * Return: True if it's contained in a node, false on spanning write. */ static bool mas_wr_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); if (unlikely(mas_is_span_wr(wr_mas))) return false; wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return true; mas_wr_walk_traverse(wr_mas); } return true; } static bool mas_wr_walk_index(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return true; mas_wr_walk_traverse(wr_mas); } return true; } /* * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs. * @l_wr_mas: The left maple write state * @r_wr_mas: The right maple write state */ static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { struct ma_state *r_mas = r_wr_mas->mas; struct ma_state *l_mas = l_wr_mas->mas; unsigned char l_slot; l_slot = l_mas->offset; if (!l_wr_mas->content) l_mas->index = l_wr_mas->r_min; if ((l_mas->index == l_wr_mas->r_min) && (l_slot && !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) { if (l_slot > 1) l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1; else l_mas->index = l_mas->min; l_mas->offset = l_slot - 1; } if (!r_wr_mas->content) { if (r_mas->last < r_wr_mas->r_max) r_mas->last = r_wr_mas->r_max; r_mas->offset++; } else if ((r_mas->last == r_wr_mas->r_max) && (r_mas->last < r_mas->max) && !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) { r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots, r_wr_mas->type, r_mas->offset + 1); r_mas->offset++; } } static inline void *mas_state_walk(struct ma_state *mas) { void *entry; entry = mas_start(mas); if (mas_is_none(mas)) return NULL; if (mas_is_ptr(mas)) return entry; return mtree_range_walk(mas); } /* * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up * to date. * * @mas: The maple state. * * Note: Leaves mas in undesirable state. * Return: The entry for @mas->index or %NULL on dead node. */ static inline void *mtree_lookup_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next; enum maple_type type; void __rcu **slots; unsigned char end; next = mas->node; do { node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = mt_pivots[type]; offset = 0; do { if (pivots[offset] >= mas->index) break; } while (++offset < end); slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); return (void *)next; dead_node: mas_reset(mas); return NULL; } static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); /* * mas_new_root() - Create a new root node that only contains the entry passed * in. * @mas: The maple state * @entry: The entry to store. * * Only valid when the index == 0 and the last == ULONG_MAX * * Return 0 on error, 1 on success. */ static inline int mas_new_root(struct ma_state *mas, void *entry) { struct maple_enode *root = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; if (!entry && !mas->index && mas->last == ULONG_MAX) { mas->depth = 0; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; goto done; } mas_node_count(mas, 1); if (mas_is_err(mas)) return 0; node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mas->depth = 1; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); done: if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); return 1; } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed * and new nodes where necessary, then place the sub-tree in the actual tree. * Note that mas is expected to point to the node which caused the store to * span. * @wr_mas: The maple write state * * Return: 0 on error, positive on success. */ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; struct ma_state *mas; unsigned char height; /* Left and Right side of spanning store */ MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry); /* * A store operation that spans multiple nodes is called a spanning * store and is handled early in the store call stack by the function * mas_is_span_wr(). When a spanning store is identified, the maple * state is duplicated. The first maple state walks the left tree path * to ``index``, the duplicate walks the right tree path to ``last``. * The data in the two nodes are combined into a single node, two nodes, * or possibly three nodes (see the 3-way split above). A ``NULL`` * written to the last entry of a node is considered a spanning store as * a rebalance is required for the operation to complete and an overflow * of data may happen. */ mas = wr_mas->mas; trace_ma_op(__func__, mas); if (unlikely(!mas->index && mas->last == ULONG_MAX)) return mas_new_root(mas, wr_mas->entry); /* * Node rebalancing may occur due to this store, so there may be three new * entries per level plus a new root. */ height = mas_mt_height(mas); mas_node_count(mas, 1 + height * 3); if (mas_is_err(mas)) return 0; /* * Set up right side. Need to get to the next offset after the spanning * store to ensure it's not NULL and to combine both the next node and * the node with the start together. */ r_mas = *mas; /* Avoid overflow, walk to next slot in the tree. */ if (r_mas.last + 1) r_mas.last++; r_mas.index = r_mas.last; mas_wr_walk_index(&r_wr_mas); r_mas.last = r_mas.index = mas->last; /* Set up left side. */ l_mas = *mas; mas_wr_walk_index(&l_wr_mas); if (!wr_mas->entry) { mas_extend_spanning_null(&l_wr_mas, &r_wr_mas); mas->offset = l_mas.offset; mas->index = l_mas.index; mas->last = l_mas.last = r_mas.last; } /* expanding NULLs may make this cover the entire range */ if (!l_mas.index && r_mas.last == ULONG_MAX) { mas_set_range(mas, 0, ULONG_MAX); return mas_new_root(mas, wr_mas->entry); } memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ mas_store_b_node(&l_wr_mas, &b_node, l_mas.end); /* Copy r_mas into b_node. */ if (r_mas.offset <= r_mas.end) mas_mab_cp(&r_mas, r_mas.offset, r_mas.end, &b_node, b_node.b_end + 1); else b_node.b_end++; /* Stop spanning searches by searching for just index. */ l_mas.index = l_mas.last = mas->index; mast.bn = &b_node; mast.orig_l = &l_mas; mast.orig_r = &r_mas; /* Combine l_mas and r_mas and split them up evenly again. */ return mas_spanning_rebalance(mas, &mast, height + 1); } /* * mas_wr_node_store() - Attempt to store the value in a node * @wr_mas: The maple write state * * Attempts to reuse the node, but may allocate. * * Return: True if stored, false otherwise */ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **dst_slots; unsigned long *dst_pivots; unsigned char dst_offset, offset_end = wr_mas->offset_end; struct maple_node reuse, *newnode; unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); /* Check if there is enough data. The room is enough. */ if (!mte_is_root(mas->node) && (new_end <= mt_min_slots[wr_mas->type]) && !(mas->mas_flags & MA_STATE_BULK)) return false; if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ else if (unlikely(wr_mas->r_max == ULONG_MAX)) mas_bulk_rebalance(mas, mas->end, wr_mas->type); /* set up node. */ if (in_rcu) { mas_node_count(mas, 1); if (mas_is_err(mas)) return false; newnode = mas_pop_node(mas); } else { memset(&reuse, 0, sizeof(struct maple_node)); newnode = &reuse; } newnode->parent = mas_mn(mas)->parent; dst_pivots = ma_pivots(newnode, wr_mas->type); dst_slots = ma_slots(newnode, wr_mas->type); /* Copy from start to insert point */ memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset); memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset); /* Handle insert of new range starting after old range */ if (wr_mas->r_min < mas->index) { rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content); dst_pivots[mas->offset++] = mas->index - 1; } /* Store the new entry and range end. */ if (mas->offset < node_pivots) dst_pivots[mas->offset] = mas->last; rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry); /* * this range wrote to the end of the node or it overwrote the rest of * the data */ if (offset_end > mas->end) goto done; dst_offset = mas->offset + 1; /* Copy to the end of node if necessary. */ copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, sizeof(void *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, sizeof(unsigned long) * (copy_size - 1)); if (new_end < node_pivots) dst_pivots[new_end] = mas->max; done: mas_leaf_set_meta(newnode, maple_leaf_64, new_end); if (in_rcu) { struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace_node(mas, old_enode); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; return true; } /* * mas_wr_slot_store: Attempt to store a value in a slot. * @wr_mas: the maple write state * * Return: True if stored, false otherwise */ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; void __rcu **slots = wr_mas->slots; bool gap = false; gap |= !mt_slot_locked(mas->tree, slots, offset); gap |= !mt_slot_locked(mas->tree, slots, offset + 1); if (wr_mas->offset_end - offset == 1) { if (mas->index == wr_mas->r_min) { /* Overwriting the range and a part of the next one */ rcu_assign_pointer(slots[offset], wr_mas->entry); wr_mas->pivots[offset] = mas->last; } else { /* Overwriting a part of the range and the next one */ rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } } else if (!mt_in_rcu(mas->tree)) { /* * Expand the range, only partially overwriting the previous and * next ranges */ gap |= !mt_slot_locked(mas->tree, slots, offset + 2); rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ } else { return false; } trace_ma_write(__func__, mas, 0, wr_mas->entry); /* * Only update gap when the new entry is empty or there is an empty * entry in the original two ranges. */ if (!wr_mas->entry || gap) mas_update_gap(mas); return true; } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!wr_mas->slots[wr_mas->offset_end]) { /* If this one is null, the next and prev are not */ mas->last = wr_mas->end_piv; } else { /* Check next slot(s) if we are overwriting the end */ if ((mas->last == wr_mas->end_piv) && (mas->end != wr_mas->offset_end) && !wr_mas->slots[wr_mas->offset_end + 1]) { wr_mas->offset_end++; if (wr_mas->offset_end == mas->end) mas->last = mas->max; else mas->last = wr_mas->pivots[wr_mas->offset_end]; wr_mas->end_piv = mas->last; } } if (!wr_mas->content) { /* If this one is null, the next and prev are not */ mas->index = wr_mas->r_min; } else { /* Check prev slot if we are overwriting the start */ if (mas->index == wr_mas->r_min && mas->offset && !wr_mas->slots[mas->offset - 1]) { mas->offset--; wr_mas->r_min = mas->index = mas_safe_min(mas, wr_mas->pivots, mas->offset); wr_mas->r_max = wr_mas->pivots[mas->offset]; } } } static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { while ((wr_mas->offset_end < wr_mas->mas->end) && (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) wr_mas->offset_end++; if (wr_mas->offset_end < wr_mas->mas->end) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; if (!wr_mas->entry) mas_wr_extend_null(wr_mas); } static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas->end + 2; new_end -= wr_mas->offset_end - mas->offset; if (wr_mas->r_min == mas->index) new_end--; if (wr_mas->end_piv == mas->last) new_end--; return new_end; } /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state * @new_end: The end of the node after the modification * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. * * Return: True if appended, false otherwise */ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas; void __rcu **slots; unsigned char end; mas = wr_mas->mas; if (mt_in_rcu(mas->tree)) return false; end = mas->end; if (mas->offset != end) return false; if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end); } slots = wr_mas->slots; if (new_end == end + 1) { if (mas->last == wr_mas->r_max) { /* Append to end of range */ rcu_assign_pointer(slots[new_end], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = new_end; } else { /* Append to start of range */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end] = mas->last; rcu_assign_pointer(slots[end], wr_mas->entry); } } else { /* Append to the range without touching any boundaries. */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end + 1] = mas->last; rcu_assign_pointer(slots[end + 1], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); mas->end = new_end; trace_ma_write(__func__, mas, new_end, wr_mas->entry); return true; } /* * mas_wr_bnode() - Slow path for a modification. * @wr_mas: The write maple state * * This is where split, rebalance end up. */ static void mas_wr_bnode(struct ma_wr_state *wr_mas) { struct maple_big_node b_node; trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); mas_commit_b_node(wr_mas, &b_node, wr_mas->mas->end); } static inline void mas_wr_modify(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end; /* Direct replacement */ if (wr_mas->r_min == mas->index && wr_mas->r_max == mas->last) { rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); if (!!wr_mas->entry ^ !!wr_mas->content) mas_update_gap(mas); return; } /* * new_end exceeds the size of the maple node and cannot enter the fast * path. */ new_end = mas_wr_new_end(wr_mas); if (new_end >= mt_slots[wr_mas->type]) goto slow_path; /* Attempt to append */ if (mas_wr_append(wr_mas, new_end)) return; if (new_end == mas->end && mas_wr_slot_store(wr_mas)) return; if (mas_wr_node_store(wr_mas, new_end)) return; if (mas_is_err(mas)) return; slow_path: mas_wr_bnode(wr_mas); } /* * mas_wr_store_entry() - Internal call to store a value * @mas: The maple state * @entry: The entry to store. * * Return: The contents that was stored at the index. */ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; wr_mas->content = mas_start(mas); if (mas_is_none(mas) || mas_is_ptr(mas)) { mas_store_root(mas, wr_mas->entry); return; } if (unlikely(!mas_wr_walk(wr_mas))) { mas_wr_spanning_store(wr_mas); return; } /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); /* New root for a single pointer */ if (unlikely(!mas->index && mas->last == ULONG_MAX)) mas_new_root(mas, wr_mas->entry); else mas_wr_modify(wr_mas); } /** * mas_insert() - Internal call to insert a value * @mas: The maple state * @entry: The entry to store * * Return: %NULL or the contents that already exists at the requested index * otherwise. The maple state needs to be checked for error conditions. */ static inline void *mas_insert(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); /* * Inserting a new range inserts either 0, 1, or 2 pivots within the * tree. If the insert fits exactly into an existing gap with a value * of NULL, then the slot only needs to be written with the new value. * If the range being inserted is adjacent to another range, then only a * single pivot needs to be inserted (as well as writing the entry). If * the new range is within a gap but does not touch any other ranges, * then two pivots need to be inserted: the start - 1, and the end. As * usual, the entry must be written. Most operations require a new node * to be allocated and replace an existing node to ensure RCU safety, * when in RCU mode. The exception to requiring a newly allocated node * is when inserting at the end of a node (appending). When done * carefully, appending can reuse the node in place. */ wr_mas.content = mas_start(mas); if (wr_mas.content) goto exists; if (mas_is_none(mas) || mas_is_ptr(mas)) { mas_store_root(mas, entry); return NULL; } /* spanning writes always overwrite something */ if (!mas_wr_walk(&wr_mas)) goto exists; /* At this point, we are at the leaf node that needs to be altered. */ wr_mas.offset_end = mas->offset; wr_mas.end_piv = wr_mas.r_max; if (wr_mas.content || (mas->last > wr_mas.r_max)) goto exists; if (!entry) return NULL; mas_wr_modify(&wr_mas); return wr_mas.content; exists: mas_set_err(mas, -EEXIST); return wr_mas.content; } /** * mas_alloc_cyclic() - Internal call to find somewhere to store an entry * @mas: The maple state. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, or -EBUSY if there are no * free entries. */ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { unsigned long min = range_lo; int ret = 0; range_lo = max(min, *next); ret = mas_empty_area(mas, range_lo, range_hi, 1); if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) { mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && range_lo > min) { ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; } if (ret < 0) return ret; do { mas_insert(mas, entry); } while (mas_nomem(mas, gfp)); if (mas_is_err(mas)) return xa_err(mas->node); *startp = mas->index; *next = *startp + 1; if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: mas_set(mas, index); mas_state_walk(mas); if (mas_is_start(mas)) goto retry; } static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, struct maple_node *node, const unsigned long index) { if (unlikely(ma_dead_node(node))) { mas_rewalk(mas, index); return true; } return false; } /* * mas_prev_node() - Find the prev non-null entry at the same level in the * tree. The prev value will be mas->node[mas->offset] or the status will be * ma_none. * @mas: The maple state * @min: The lower limit to search * * The prev node value will be mas->node[mas->offset] or the status will be * ma_none. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) { enum maple_type mt; int offset, level; void __rcu **slots; struct maple_node *node; unsigned long *pivots; unsigned long max; node = mas_mn(mas); if (!mas->min) goto no_entry; max = mas->min - 1; if (max < min) goto no_entry; level = 0; do { if (ma_is_root(node)) goto no_entry; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; offset = mas->offset; level++; node = mas_mn(mas); } while (!offset); offset--; mt = mte_node_type(mas->node); while (level > 1) { level--; slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); if (unlikely(ma_dead_node(node))) return 1; mt = mte_node_type(mas->node); node = mas_mn(mas); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, max); if (unlikely(ma_dead_node(node))) return 1; } slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); pivots = ma_pivots(node, mt); if (unlikely(ma_dead_node(node))) return 1; if (likely(offset)) mas->min = pivots[offset - 1] + 1; mas->max = max; mas->offset = mas_data_end(mas); if (unlikely(mte_dead_node(mas->node))) return 1; mas->end = mas->offset; return 0; no_entry: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_underflow; return 0; } /* * mas_prev_slot() - Get the entry in the previous slot * * @mas: The maple state * @max: The minimum starting range * @empty: Can be empty * @set_underflow: Set the @mas->node to underflow state on limit. * * Return: The entry in the previous slot which is possibly NULL */ static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; unsigned long pivot; enum maple_type type; unsigned long *pivots; struct maple_node *node; unsigned long save_point = mas->index; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->min <= min) { pivot = mas_safe_min(mas, pivots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot <= min) goto underflow; } again: if (likely(mas->offset)) { mas->offset--; mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { if (mas->index <= min) goto underflow; if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_underflow(mas))) return NULL; mas->last = mas->max; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->index = pivots[mas->offset - 1] + 1; } slots = ma_slots(node, type); entry = mas_slot(mas, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (likely(entry)) return entry; if (!empty) { if (mas->index <= min) { mas->status = ma_underflow; return NULL; } goto again; } return entry; underflow: mas->status = ma_underflow; return NULL; } /* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state * @max: The maximum pivot value to check. * * The next value will be mas->node[mas->offset] or the status will have * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, unsigned long max) { unsigned long min; unsigned long *pivots; struct maple_enode *enode; struct maple_node *tmp; int level = 0; unsigned char node_end; enum maple_type mt; void __rcu **slots; if (mas->max >= max) goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) goto overflow; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; level++; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); node_end = ma_data_end(node, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; } while (unlikely(mas->offset == node_end)); slots = ma_slots(node, mt); mas->offset++; enode = mas_slot(mas, slots, mas->offset); if (unlikely(ma_dead_node(node))) return 1; if (level > 1) mas->offset = 0; while (unlikely(level > 1)) { level--; mas->node = enode; node = mas_mn(mas); mt = mte_node_type(mas->node); slots = ma_slots(node, mt); enode = mas_slot(mas, slots, 0); if (unlikely(ma_dead_node(node))) return 1; } if (!mas->offset) pivots = ma_pivots(node, mt); mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt); tmp = mte_to_node(enode); mt = mte_node_type(enode); pivots = ma_pivots(tmp, mt); mas->end = ma_data_end(tmp, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; mas->node = enode; mas->min = min; return 0; overflow: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_overflow; return 0; } /* * mas_next_slot() - Get the entry in the next slot * * @mas: The maple state * @max: The maximum starting range * @empty: Can be empty * @set_overflow: Should @mas->node be set to overflow when the limit is * reached. * * Return: The entry in the next slot which is possibly NULL */ static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; unsigned long pivot; enum maple_type type; struct maple_node *node; unsigned long save_point = mas->last; void *entry; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->max >= max) { if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot >= max) { /* Was at the limit, next will extend beyond */ mas->status = ma_overflow; return NULL; } } if (likely(mas->offset < mas->end)) { mas->index = pivots[mas->offset] + 1; again: mas->offset++; if (likely(mas->offset < mas->end)) mas->last = pivots[mas->offset]; else mas->last = mas->max; } else { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; mas->offset = 0; mas->index = mas->min; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->last = pivots[0]; } slots = ma_slots(node, type); entry = mt_slot(mas->tree, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (entry) return entry; if (!empty) { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } mas->index = mas->last + 1; goto again; } return entry; } /* * mas_next_entry() - Internal function to get the next entry. * @mas: The maple state * @limit: The maximum range start. * * Set the @mas->node to the next entry and the range_start to * the beginning value for the entry. Does not check beyond @limit. * Sets @mas->index and @mas->last to the range, Does not update @mas->index and * @mas->last on overflow. * Restarts on dead nodes. * * Return: the next entry or %NULL. */ static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) { if (mas->last >= limit) { mas->status = ma_overflow; return NULL; } return mas_next_slot(mas, limit, false); } /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. * @mas: The maple state * @size: The needed size. * * Return: True if found in a leaf, false otherwise. * */ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) return true; if (ma_is_dense(type)) { /* dense nodes. */ mas->offset = (unsigned char)(mas->index - mas->min); return true; } pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); /* Skip out of bounds. */ while (mas->last < min) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = max - min + 1; if (gap) { if ((size <= gap) && (size <= mas->last - min + 1)) break; if (!gaps) { /* Skip the next slot, it cannot be a gap. */ if (offset < 2) goto ascend; offset -= 2; max = pivots[offset]; min = mas_safe_min(mas, pivots, offset); continue; } } if (!offset) goto ascend; offset--; max = min - 1; min = mas_safe_min(mas, pivots, offset); } if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) goto no_space; if (unlikely(ma_is_leaf(type))) { mas->offset = offset; *gap_min = min; *gap_max = min + gap - 1; return true; } /* descend, only happens under lock. */ mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = max; mas->offset = mas_data_end(mas); return false; ascend: if (!mte_is_root(mas->node)) return false; no_space: mas_set_err(mas, -EBUSY); return false; } static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; unsigned char offset, data_end; unsigned long *gaps, *pivots; void __rcu **slots; struct maple_node *node; bool found = false; if (ma_is_dense(type)) { mas->offset = (unsigned char)(mas->index - mas->min); return true; } node = mas_mn(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) goto next_slot; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = min(pivot, mas->last) - max(mas->index, min) + 1; else goto next_slot; if (gap >= size) { if (ma_is_leaf(type)) { found = true; goto done; } if (mas->index <= pivot) { mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = pivot; offset = 0; break; } } next_slot: min = pivot + 1; if (mas->last <= pivot) { mas_set_err(mas, -EBUSY); return true; } } if (mte_is_root(mas->node)) found = true; done: mas->offset = offset; return found; } /** * mas_walk() - Search for @mas->index in the tree. * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. */ void *mas_walk(struct ma_state *mas) { void *entry; if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { goto retry; } else if (mas_is_none(mas)) { mas->index = 0; mas->last = ULONG_MAX; } else if (mas_is_ptr(mas)) { if (!mas->index) { mas->last = 0; return entry; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return NULL; } return entry; } EXPORT_SYMBOL_GPL(mas_walk); static inline bool mas_rewind_node(struct ma_state *mas) { unsigned char slot; do { if (mte_is_root(mas->node)) { slot = mas->offset; if (!slot) return false; } else { mas_ascend(mas); slot = mas->offset; } } while (!slot); mas->offset = --slot; return true; } /* * mas_skip_node() - Internal function. Skip over a node. * @mas: The maple state. * * Return: true if there is another node, false otherwise. */ static inline bool mas_skip_node(struct ma_state *mas) { if (mas_is_err(mas)) return false; do { if (mte_is_root(mas->node)) { if (mas->offset >= mas_data_end(mas)) { mas_set_err(mas, -EBUSY); return false; } } else { mas_ascend(mas); } } while (mas->offset >= mas_data_end(mas)); mas->offset++; return true; } /* * mas_awalk() - Allocation walk. Search from low address to high, for a gap of * @size * @mas: The maple state * @size: The size of the gap required * * Search between @mas->index and @mas->last for a gap of @size. */ static inline void mas_awalk(struct ma_state *mas, unsigned long size) { struct maple_enode *last = NULL; /* * There are 4 options: * go to child (descend) * go back to parent (ascend) * no gap found. (return, slot == MAPLE_NODE_SLOTS) * found the gap. (return, slot != MAPLE_NODE_SLOTS) */ while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { if (last == mas->node) mas_skip_node(mas); else last = mas->node; } } /* * mas_sparse_area() - Internal function. Return upper or lower limit when * searching for a gap in an empty tree. * @mas: The maple state * @min: the minimum range * @max: The maximum range * @size: The size of the gap * @fwd: Searching forward or back */ static inline int mas_sparse_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size, bool fwd) { if (!unlikely(mas_is_none(mas)) && min == 0) { min++; /* * At this time, min is increased, we need to recheck whether * the size is satisfied. */ if (min > max || max - min + 1 < size) return -EBUSY; } /* mas_is_ptr */ if (fwd) { mas->index = min; mas->last = min + size - 1; } else { mas->last = max; mas->index = max - size + 1; } return 0; } /* * mas_empty_area() - Get the lowest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { unsigned char offset; unsigned long *pivots; enum maple_type mt; struct maple_node *node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) mas->offset -= 2; else if (!mas_skip_node(mas)) return -EBUSY; /* Empty set */ if (mas_is_none(mas) || mas_is_ptr(mas)) return mas_sparse_area(mas, min, max, size, true); /* The start of the window can only be within these values */ mas->index = min; mas->last = max; mas_awalk(mas, size); if (unlikely(mas_is_err(mas))) return xa_err(mas->node); offset = mas->offset; if (unlikely(offset == MAPLE_NODE_SLOTS)) return -EBUSY; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); min = mas_safe_min(mas, pivots, offset); if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; mas->end = ma_data_end(node, mt, pivots, mas->max); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); /* * mas_empty_area_rev() - Get the highest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { struct maple_enode *last = mas->node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if ((mas->offset < 2) && (!mas_rewind_node(mas))) return -EBUSY; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return mas_sparse_area(mas, min, max, size, false); else if (mas->offset >= 2) mas->offset -= 2; else mas->offset = mas_data_end(mas); /* The start of the window can only be within these values. */ mas->index = min; mas->last = max; while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; } else { last = mas->node; } } if (mas_is_err(mas)) return xa_err(mas->node); if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; /* Trim the upper limit to the max. */ if (max < mas->last) mas->last = max; mas->index = mas->last - size + 1; mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); /* * mte_dead_leaves() - Mark all leaves of a node as dead. * @mas: The maple state * @slots: Pointer to the slot array * @type: The maple node type * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; for (offset = 0; offset < mt_slot_count(enode); offset++) { entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ if (!node || !type) break; mte_set_node_dead(entry); node->type = type; rcu_assign_pointer(slots[offset], node); } return offset; } /** * mte_dead_walk() - Walk down a dead tree to just before the leaves * @enode: The maple encoded node * @offset: The starting offset * * Note: This can only be used from the RCU callback context. */ static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { struct maple_node *node, *next; void __rcu **slots = NULL; next = mte_to_node(*enode); do { *enode = ma_enode_ptr(next); node = mte_to_node(*enode); slots = ma_slots(node, node->type); next = rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } /** * mt_free_walk() - Walk & free a tree in the RCU callback context * @head: The RCU head that's within the node. * * Note: This can only be used from the RCU callback context. */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; struct maple_enode *enode; unsigned char offset; enum maple_type type; node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; start = node; enode = mt_mk_node(node, node->type); slots = mte_dead_walk(&enode, 0); node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if ((offset < mt_slots[type]) && rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map))) slots = mte_dead_walk(&enode, offset); node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); free_leaf: mt_free_rcu(&node->rcu); } static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; struct maple_enode *next = *enode; void __rcu **slots = NULL; enum maple_type type; unsigned char next_offset = 0; do { *enode = next; node = mte_to_node(*enode); type = mte_node_type(*enode); slots = ma_slots(node, type); next = mt_slot_locked(mt, slots, next_offset); if ((mte_dead_node(next))) next = mt_slot_locked(mt, slots, ++next_offset); mte_set_node_dead(*enode); node->type = type; node->piv_parent = prev; node->parent_slot = offset; offset = next_offset; next_offset = 0; prev = *enode; } while (!mte_is_leaf(next)); return slots; } static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; if (mte_is_leaf(enode)) { node->type = mte_node_type(enode); goto free_leaf; } start = enode; slots = mte_destroy_descend(&enode, mt, start, 0); node = mte_to_node(enode); // Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { parent = enode; enode = tmp; slots = mte_destroy_descend(&enode, mt, parent, offset); } next: node = mte_to_node(enode); } while (start != enode); node = mte_to_node(enode); node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); free_leaf: if (free) mt_free_rcu(&node->rcu); else mt_clear_meta(mt, node, node->type); } /* * mte_destroy_walk() - Free a tree or sub-tree. * @enode: the encoded maple node (maple_enode) to start * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_tree *mt) { struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { mt_destroy_walk(enode, mt, true); } } static void mas_wr_store_setup(struct ma_wr_state *wr_mas) { if (!mas_is_active(wr_mas->mas)) { if (mas_is_start(wr_mas->mas)) return; if (unlikely(mas_is_paused(wr_mas->mas))) goto reset; if (unlikely(mas_is_none(wr_mas->mas))) goto reset; if (unlikely(mas_is_overflow(wr_mas->mas))) goto reset; if (unlikely(mas_is_underflow(wr_mas->mas))) goto reset; } /* * A less strict version of mas_is_span_wr() where we allow spanning * writes within this node. This is to stop partial walks in * mas_prealloc() from being reset. */ if (wr_mas->mas->last > wr_mas->mas->max) goto reset; if (wr_mas->entry) return; if (mte_is_leaf(wr_mas->mas->node) && wr_mas->mas->last == wr_mas->mas->max) goto reset; return; reset: mas_reset(wr_mas->mas); } /* Interface */ /** * mas_store() - Store an @entry. * @mas: The maple state. * @entry: The entry to store. * * The @mas->index and @mas->last is used to set the range for the @entry. * Note: The @mas should have pre-allocated entries to ensure there is memory to * store the entry. Please see mas_expected_entries()/mas_destroy() for more details. * * Return: the first entry between mas->index and mas->last or %NULL. */ void *mas_store(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) pr_err("Error %lX > %lX %p\n", mas->index, mas->last, entry); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); return NULL; } #endif /* * Storing is the same operation as insert with the added caveat that it * can overwrite entries. Although this seems simple enough, one may * want to examine what happens if a single store operation was to * overwrite multiple entries within a self-balancing B-Tree. */ mas_wr_store_setup(&wr_mas); mas_wr_store_entry(&wr_mas); return wr_mas.content; } EXPORT_SYMBOL_GPL(mas_store); /** * mas_store_gfp() - Store a value into the tree. * @mas: The maple state * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations if necessary. * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); mas_wr_store_setup(&wr_mas); trace_ma_write(__func__, mas, 0, entry); retry: mas_wr_store_entry(&wr_mas); if (unlikely(mas_nomem(mas, gfp))) goto retry; if (unlikely(mas_is_err(mas))) return xa_err(mas->node); return 0; } EXPORT_SYMBOL_GPL(mas_store_gfp); /** * mas_store_prealloc() - Store a value into the tree using memory * preallocated in the maple state. * @mas: The maple state * @entry: The entry to store. */ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); mas_wr_store_setup(&wr_mas); trace_ma_write(__func__, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); mas_destroy(mas); } EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); unsigned char node_size; int request = 1; int ret; if (unlikely(!mas->index && mas->last == ULONG_MAX)) goto ask_now; mas_wr_store_setup(&wr_mas); wr_mas.content = mas_start(mas); /* Root expand */ if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) goto ask_now; if (unlikely(!mas_wr_walk(&wr_mas))) { /* Spanning store, use worst case for now */ request = 1 + mas_mt_height(mas) * 3; goto ask_now; } /* At this point, we are at the leaf node that needs to be altered. */ /* Exact fit, no nodes needed. */ if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last) return 0; mas_wr_end_piv(&wr_mas); node_size = mas_wr_new_end(&wr_mas); /* Slot store, does not require additional nodes */ if (node_size == mas->end) { /* reuse node */ if (!mt_in_rcu(mas->tree)) return 0; /* shifting boundary */ if (wr_mas.offset_end - mas->offset == 1) return 0; } if (node_size >= mt_slots[wr_mas.type]) { /* Split, worst case for now. */ request = 1 + mas_mt_height(mas) * 2; goto ask_now; } /* New root needs a single node */ if (unlikely(mte_is_root(mas->node))) goto ask_now; /* Potential spanning rebalance collapsing a node, use worst-case */ if (node_size - 1 <= mt_min_slots[wr_mas.type]) request = mas_mt_height(mas) * 2 - 1; /* node store, slot store needs one node */ ask_now: mas_node_count_gfp(mas, request, gfp); mas->mas_flags |= MA_STATE_PREALLOC; if (likely(!mas_is_err(mas))) return 0; mas_set_alloc_req(mas, 0); ret = xa_err(mas->node); mas_reset(mas); mas_destroy(mas); mas_reset(mas); return ret; } EXPORT_SYMBOL_GPL(mas_preallocate); /* * mas_destroy() - destroy a maple state. * @mas: The maple state * * Upon completion, check the left-most node and rebalance against the node to * the right if necessary. Frees any allocated nodes associated with this maple * state. */ void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; unsigned long total; /* * When using mas_for_each() to insert an expected number of elements, * it is possible that the number inserted is less than the expected * number. To fix an invalid final node, a check is performed here to * rebalance the previous node with the final node. */ if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; mas_start(mas); mtree_range_walk(mas); end = mas->end + 1; if (end < mt_min_slot_count(mas->node) - 1) mas_destroy_rebalance(mas, end); mas->mas_flags &= ~MA_STATE_REBALANCE; } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); total = mas_allocated(mas); while (total) { node = mas->alloc; mas->alloc = node->slot[0]; if (node->node_count > 1) { size_t count = node->node_count - 1; mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } mt_free_one(ma_mnode_ptr(node)); total--; } mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); /* * mas_expected_entries() - Set the expected number of entries that will be inserted. * @mas: The maple state * @nr_entries: The number of expected entries. * * This will attempt to pre-allocate enough nodes to store the expected number * of entries. The allocations will occur using the bulk allocator interface * for speed. Please call mas_destroy() on the @mas after inserting the entries * to ensure any unused nodes are freed. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) { int nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2; struct maple_enode *enode = mas->node; int nr_nodes; int ret; /* * Sometimes it is necessary to duplicate a tree to a new tree, such as * forking a process and duplicating the VMAs from one tree to a new * tree. When such a situation arises, it is known that the new tree is * not going to be used until the entire tree is populated. For * performance reasons, it is best to use a bulk load with RCU disabled. * This allows for optimistic splitting that favours the left and reuse * of nodes during the operation. */ /* Optimize splitting for bulk insert in-order */ mas->mas_flags |= MA_STATE_BULK; /* * Avoid overflow, assume a gap between each entry and a trailing null. * If this is wrong, it just means allocation can happen during * insertion of entries. */ nr_nodes = max(nr_entries, nr_entries * 2 + 1); if (!mt_is_alloc(mas->tree)) nonleaf_cap = MAPLE_RANGE64_SLOTS - 2; /* Leaves; reduce slots to keep space for expansion */ nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2); /* Internal nodes */ nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); /* Add working room for split (2 nodes) + new parents */ mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL); /* Detect if allocations run out */ mas->mas_flags |= MA_STATE_PREALLOC; if (!mas_is_err(mas)) return 0; ret = xa_err(mas->node); mas->node = enode; mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_expected_entries); static bool mas_next_setup(struct ma_state *mas, unsigned long max, void **entry) { bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { mas->status = ma_overflow; return true; } switch (mas->status) { case ma_active: return false; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; fallthrough; case ma_start: mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ break; case ma_overflow: /* Overflowed before, but the max changed */ mas->status = ma_active; break; case ma_underflow: /* The user expects the mas to be one before where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (likely(mas_is_active(mas))) /* Fast path */ return false; if (mas_is_ptr(mas)) { *entry = NULL; if (was_none && mas->index == 0) { mas->index = mas->last = 0; return true; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return true; } if (mas_is_none(mas)) return true; return false; } /** * mas_next() - Get the next entry. * @mas: The maple state * @max: The maximum index to check. * * Returns the next entry after @mas->index. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); /** * mas_next_range() - Advance the maple state to the next range * @mas: The maple state * @max: The maximum index to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); /** * mt_next() - get the next value in the maple tree * @mt: The maple tree * @index: The start index * @max: The maximum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry higher than @index or %NULL if nothing is found. */ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_next(&mas, max); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } switch (mas->status) { case ma_active: return false; case ma_start: break; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; break; case ma_underflow: /* underflowed before but the min changed */ mas->status = ma_active; break; case ma_overflow: /* User expects mas to be one after where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { if (!mas->index) { mas->status = ma_none; return true; } mas->index = mas->last = 0; *entry = mas_root(mas); return true; } if (mas_is_none(mas)) { if (mas->index) { /* Walked to out-of-range pointer? */ mas->index = mas->last = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } return true; } return false; } /** * mas_prev() - Get the previous entry * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); /** * mas_prev_range() - Advance to the previous range * @mas: The maple state * @min: The minimum value to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the node is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev_range(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); /** * mt_prev() - get the previous value in the maple tree * @mt: The maple tree * @index: The start index * @min: The minimum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_prev(&mas, min); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_prev); /** * mas_pause() - Pause a mas_find/mas_for_each to drop the lock. * @mas: The maple state to pause * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @mas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call mas_pause(), the mt_for_each() * iterator may be more appropriate. * */ void mas_pause(struct ma_state *mas) { mas->status = ma_pause; mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); /** * mas_find_setup() - Internal function to set up mas_find*(). * @mas: The maple state * @max: The maximum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { switch (mas->status) { case ma_active: if (mas->last < max) return false; return true; case ma_start: break; case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; mas->status = ma_start; break; case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; mas->status = ma_start; break; case ma_underflow: /* mas is pointing at entry before unable to go lower */ if (unlikely(mas->index >= max)) { mas->status = ma_overflow; return true; } mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_overflow: if (unlikely(mas->last >= max)) return true; mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index > max) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto ptr_out_of_range; if (unlikely(mas_is_none(mas))) return true; if (mas->index == max) return true; return false; ptr_out_of_range: mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; } /** * mas_find() - On the first call, find the entry at or after mas->index up to * %max. Otherwise, find the entry after mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ entry = mas_next_slot(mas, max, false); /* Ignore overflow */ mas->status = ma_active; return entry; } EXPORT_SYMBOL_GPL(mas_find); /** * mas_find_range() - On the first call, find the entry at or after * mas->index up to %max. Otherwise, advance to the next slot mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); /** * mas_find_rev_setup() - Internal function to set up mas_find_*_rev() * @mas: The maple state * @min: The minimum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { switch (mas->status) { case ma_active: goto active; case ma_start: break; case ma_pause: if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->last = --mas->index; mas->status = ma_start; break; case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; mas->status = ma_start; break; case ma_overflow: /* user expects the mas to be one after where it is */ if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->status = ma_active; break; case ma_underflow: /* user expects the mas to be one before where it is */ if (unlikely(mas->index <= min)) return true; mas->status = ma_active; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index < min) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto none; if (unlikely(mas_is_none(mas))) { /* * Walked to the location, and there was nothing so the previous * location is 0. */ mas->last = mas->index = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } active: if (mas->index < min) return true; return false; none: mas->status = ma_none; return true; } /** * mas_find_rev: On the first call, find the first non-null entry at or below * mas->index down to %min. Otherwise find the first non-null entry below * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); /** * mas_find_range_rev: On the first call, find the first non-null entry at or * below mas->index down to %min. Otherwise advance to the previous slot after * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); /** * mas_erase() - Find the range in which index resides and erase the entire * range. * @mas: The maple state * * Must hold the write lock. * Searches for @mas->index, sets @mas->index and @mas->last to the range and * erases that range. * * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated. */ void *mas_erase(struct ma_state *mas) { void *entry; MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; /* Retry unnecessary when holding the write lock. */ entry = mas_state_walk(mas); if (!entry) return NULL; write_retry: /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); mas_wr_store_setup(&wr_mas); mas_wr_store_entry(&wr_mas); if (mas_nomem(mas, GFP_KERNEL)) goto write_retry; return entry; } EXPORT_SYMBOL_GPL(mas_erase); /** * mas_nomem() - Check if there was an error allocating and do the allocation * if necessary If there are allocations, then free them. * @mas: The maple state * @gfp: The GFP_FLAGS to use for allocations * Return: true on allocation, false otherwise. */ bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { if (likely(mas->node != MA_ERROR(-ENOMEM))) { mas_destroy(mas); return false; } if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); mas_alloc_nodes(mas, gfp); mtree_lock(mas->tree); } else { mas_alloc_nodes(mas, gfp); } if (!mas_allocated(mas)) return false; mas->status = ma_start; return true; } void __init maple_tree_init(void) { maple_node_cache = kmem_cache_create("maple_node", sizeof(struct maple_node), sizeof(struct maple_node), SLAB_PANIC, NULL); } /** * mtree_load() - Load a value stored in a maple tree * @mt: The maple tree * @index: The index to load * * Return: the entry or %NULL */ void *mtree_load(struct maple_tree *mt, unsigned long index) { MA_STATE(mas, mt, index, index); void *entry; trace_ma_read(__func__, &mas); rcu_read_lock(); retry: entry = mas_start(&mas); if (unlikely(mas_is_none(&mas))) goto unlock; if (unlikely(mas_is_ptr(&mas))) { if (index) entry = NULL; goto unlock; } entry = mtree_lookup_walk(&mas); if (!entry && unlikely(mas_is_start(&mas))) goto retry; unlock: rcu_read_unlock(); if (xa_is_zero(entry)) return NULL; return entry; } EXPORT_SYMBOL(mtree_load); /** * mtree_store_range() - Store an entry at a given range. * @mt: The maple tree * @index: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store_range(struct maple_tree *mt, unsigned long index, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(mas, mt, index, last); MA_WR_STATE(wr_mas, &mas, entry); trace_ma_write(__func__, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (index > last) return -EINVAL; mtree_lock(mt); retry: mas_wr_store_entry(&wr_mas); if (mas_nomem(&mas, gfp)) goto retry; mtree_unlock(mt); if (mas_is_err(&mas)) return xa_err(mas.node); return 0; } EXPORT_SYMBOL(mtree_store_range); /** * mtree_store() - Store an entry at a given index. * @mt: The maple tree * @index: The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_store_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_store); /** * mtree_insert_range() - Insert an entry at a given range if there is no value. * @mt: The maple tree * @first: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (first > last) return -EINVAL; mtree_lock(mt); retry: mas_insert(&ms, entry); if (mas_nomem(&ms, gfp)) goto retry; mtree_unlock(mt); if (mas_is_err(&ms)) return xa_err(ms.node); return 0; } EXPORT_SYMBOL(mtree_insert_range); /** * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_insert_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_insert); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_alloc_range); /** * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree. * @mt: The maple tree. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Finds an empty entry in @mt after @next, stores the new index into * the @id pointer, stores the entry at that index, then updates @next. * * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag. * * Context: Any context. Takes and releases the mt.lock. May sleep if * the @gfp flags permit. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no * free entries. */ int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { int ret; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); ret = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi, next, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_alloc_cyclic); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area_rev(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); /** * mtree_erase() - Find an index and erase the entire range. * @mt: The maple tree * @index: The index to erase * * Erasing is the same as a walk to an entry then a store of a NULL to that * ENTIRE range. In fact, it is implemented as such using the advanced API. * * Return: The entry stored at the @index or %NULL */ void *mtree_erase(struct maple_tree *mt, unsigned long index) { void *entry = NULL; MA_STATE(mas, mt, index, index); trace_ma_op(__func__, &mas); mtree_lock(mt); entry = mas_erase(&mas); mtree_unlock(mt); return entry; } EXPORT_SYMBOL(mtree_erase); /* * mas_dup_free() - Free an incomplete duplication of a tree. * @mas: The maple state of a incomplete tree. * * The parameter @mas->node passed in indicates that the allocation failed on * this node. This function frees all nodes starting from @mas->node in the * reverse order of mas_dup_build(). There is no need to hold the source tree * lock at this time. */ static void mas_dup_free(struct ma_state *mas) { struct maple_node *node; enum maple_type type; void __rcu **slots; unsigned char count, i; /* Maybe the first node allocation failed. */ if (mas_is_none(mas)) return; while (!mte_is_root(mas->node)) { mas_ascend(mas); if (mas->offset) { mas->offset--; do { mas_descend(mas); mas->offset = mas_data_end(mas); } while (!mte_is_leaf(mas->node)); mas_ascend(mas); } node = mte_to_node(mas->node); type = mte_node_type(mas->node); slots = ma_slots(node, type); count = mas_data_end(mas) + 1; for (i = 0; i < count; i++) ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; mt_free_bulk(count, slots); } node = mte_to_node(mas->node); mt_free_one(node); } /* * mas_copy_node() - Copy a maple node and replace the parent. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @parent: The parent of the new node. * * Copy @mas->node to @new_mas->node, set @parent to be the parent of * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, struct maple_pnode *parent) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); unsigned long val; /* Copy the node completely. */ memcpy(new_node, node, sizeof(struct maple_node)); /* Update the parent node pointer. */ val = (unsigned long)node->parent & MAPLE_NODE_MASK; new_node->parent = ma_parent_ptr(val | (unsigned long)parent); } /* * mas_dup_alloc() - Allocate child nodes for a maple node. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @gfp: The GFP_FLAGS to use for allocations. * * This function allocates child nodes for @new_mas->node during the duplication * process. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); enum maple_type type; unsigned char request, count, i; void __rcu **slots; void __rcu **new_slots; unsigned long val; /* Allocate memory for child nodes. */ type = mte_node_type(mas->node); new_slots = ma_slots(new_node, type); request = mas_data_end(mas) + 1; count = mt_alloc_bulk(gfp, request, (void **)new_slots); if (unlikely(count < request)) { memset(new_slots, 0, request * sizeof(void *)); mas_set_err(mas, -ENOMEM); return; } /* Restore node type information in slots. */ slots = ma_slots(node, type); for (i = 0; i < count; i++) { val = (unsigned long)mt_slot_locked(mas->tree, slots, i); val &= MAPLE_NODE_MASK; ((unsigned long *)new_slots)[i] |= val; } } /* * mas_dup_build() - Build a new maple tree from a source tree * @mas: The maple state of source tree, need to be in MAS_START state. * @new_mas: The maple state of new tree, need to be in MAS_START state. * @gfp: The GFP_FLAGS to use for allocations. * * This function builds a new tree in DFS preorder. If the memory allocation * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the * last node. mas_dup_free() will free the incomplete duplication of a tree. * * Note that the attributes of the two trees need to be exactly the same, and the * new tree needs to be empty, otherwise -EINVAL will be set in @mas. */ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node; struct maple_pnode *parent = NULL; struct maple_enode *root; enum maple_type type; if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || unlikely(!mtree_empty(new_mas->tree))) { mas_set_err(mas, -EINVAL); return; } root = mas_start(mas); if (mas_is_ptr(mas) || mas_is_none(mas)) goto set_new_tree; node = mt_alloc_one(gfp); if (!node) { new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } type = mte_node_type(mas->node); root = mt_mk_node(node, type); new_mas->node = root; new_mas->min = 0; new_mas->max = ULONG_MAX; root = mte_mk_root(root); while (1) { mas_copy_node(mas, new_mas, parent); if (!mte_is_leaf(mas->node)) { /* Only allocate child nodes for non-leaf nodes. */ mas_dup_alloc(mas, new_mas, gfp); if (unlikely(mas_is_err(mas))) return; } else { /* * This is the last leaf node and duplication is * completed. */ if (mas->max == ULONG_MAX) goto done; /* This is not the last leaf node and needs to go up. */ do { mas_ascend(mas); mas_ascend(new_mas); } while (mas->offset == mas_data_end(mas)); /* Move to the next subtree. */ mas->offset++; new_mas->offset++; } mas_descend(mas); parent = ma_parent_ptr(mte_to_node(new_mas->node)); mas_descend(new_mas); mas->offset = 0; new_mas->offset = 0; } done: /* Specially handle the parent of the root node. */ mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); set_new_tree: /* Make them the same height */ new_mas->tree->ma_flags = mas->tree->ma_flags; rcu_assign_pointer(new_mas->tree->ma_root, root); } /** * __mt_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * Note that the user needs to manually lock the source tree and the new tree. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_dup_build(&mas, &new_mas, gfp); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } return ret; } EXPORT_SYMBOL(__mt_dup); /** * mtree_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_lock(&new_mas); mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); mas_dup_build(&mas, &new_mas, gfp); mas_unlock(&mas); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } mas_unlock(&new_mas); return ret; } EXPORT_SYMBOL(mtree_dup); /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree * * Note: Does not handle locking. */ void __mt_destroy(struct maple_tree *mt) { void *root = mt_root_locked(mt); rcu_assign_pointer(mt->ma_root, NULL); if (xa_is_node(root)) mte_destroy_walk(root, mt); mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); /** * mtree_destroy() - Destroy a maple tree * @mt: The maple tree * * Frees all resources used by the tree. Handles locking. */ void mtree_destroy(struct maple_tree *mt) { mtree_lock(mt); __mt_destroy(mt); mtree_unlock(mt); } EXPORT_SYMBOL(mtree_destroy); /** * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value of the search range * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * In case that an entry is found @index is updated to point to the next * possible entry independent whether the found entry is occupying a * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) { MA_STATE(mas, mt, *index, *index); void *entry; #ifdef CONFIG_DEBUG_MAPLE_TREE unsigned long copy = *index; #endif trace_ma_read(__func__, &mas); if ((*index) > max) return NULL; rcu_read_lock(); retry: entry = mas_state_walk(&mas); if (mas_is_start(&mas)) goto retry; if (unlikely(xa_is_zero(entry))) entry = NULL; if (entry) goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { entry = mas_next_entry(&mas, max); if (likely(entry && !xa_is_zero(entry))) break; } if (unlikely(xa_is_zero(entry))) entry = NULL; unlock: rcu_read_unlock(); if (likely(entry)) { *index = mas.last + 1; #ifdef CONFIG_DEBUG_MAPLE_TREE if (MT_WARN_ON(mt, (*index) && ((*index) <= copy))) pr_err("index not increased! %lx <= %lx\n", *index, copy); #endif } return entry; } EXPORT_SYMBOL(mt_find); /** * mt_find_after() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value to check * * Same as mt_find() except that it checks @index for 0 before * searching. If @index == 0, the search is aborted. This covers a wrap * around of @index to 0 in an iterator loop. * * Return: The entry at or after the @index or %NULL */ void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max) { if (!(*index)) return NULL; return mt_find(mt, index, max); } EXPORT_SYMBOL(mt_find_after); #ifdef CONFIG_DEBUG_MAPLE_TREE atomic_t maple_tree_tests_run; EXPORT_SYMBOL_GPL(maple_tree_tests_run); atomic_t maple_tree_tests_passed; EXPORT_SYMBOL_GPL(maple_tree_tests_passed); #ifndef __KERNEL__ extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int); void mt_set_non_kernel(unsigned int val) { kmem_cache_set_non_kernel(maple_node_cache, val); } extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { return kmem_cache_get_alloc(maple_node_cache); } extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *); void mt_zero_nr_tallocated(void) { kmem_cache_zero_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *); unsigned int mt_nr_tallocated(void) { return kmem_cache_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *); unsigned int mt_nr_allocated(void) { return kmem_cache_nr_allocated(maple_node_cache); } void mt_cache_shrink(void) { } #else /* * mt_cache_shrink() - For testing, don't use this. * * Certain testcases can trigger an OOM when combined with other memory * debugging configuration options. This function is used to reduce the * possibility of an out of memory even due to kmem_cache objects remaining * around for longer than usual. */ void mt_cache_shrink(void) { kmem_cache_shrink(maple_node_cache); } EXPORT_SYMBOL_GPL(mt_cache_shrink); #endif /* not defined __KERNEL__ */ /* * mas_get_slot() - Get the entry in the maple state node stored at @offset. * @mas: The maple state * @offset: The offset into the slot array to fetch. * * Return: The entry stored at @offset. */ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, unsigned char offset) { return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)), offset); } /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) return; mas->node = mn; mas_ascend(mas); do { p = mas->node; p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; mas->min = p_min; } /* Tree validations */ static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format); static void mt_dump_range(unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { static const char spaces[] = " "; switch(format) { case mt_dump_hex: if (min == max) pr_info("%.*s%lx: ", depth * 2, spaces, min); else pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); break; case mt_dump_dec: if (min == max) pr_info("%.*s%lu: ", depth * 2, spaces, min); else pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); } } static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else if (mt_is_reserved(entry)) pr_cont("UNKNOWN ENTRY (%p)\n", entry); else pr_cont("%p\n", entry); } static void mt_dump_range64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_range_64 *node = &mte_to_node(entry)->mr64; bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { switch(format) { case mt_dump_hex: pr_cont("%p %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont("%p %lu ", node->slot[i], node->pivot[i]); } } pr_cont("%p\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_arange64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { switch (format) { case mt_dump_hex: pr_cont("%lx ", node->gap[i]); break; case mt_dump_dec: pr_cont("%lu ", node->gap[i]); } } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { switch (format) { case mt_dump_hex: pr_cont("%p %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont("%p %lu ", node->slot[i], node->pivot[i]); } } pr_cont("%p\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_ARANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i]) break; if (last == 0 && i > 0) break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); break; } first = last + 1; } } static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_node *node = mte_to_node(entry); unsigned int type = mte_node_type(entry); unsigned int i; mt_dump_range(min, max, depth, format); pr_cont("node %p depth %d type %d parent %p", node, depth, type, node ? node->parent : NULL); switch (type) { case maple_dense: pr_cont("\n"); for (i = 0; i < MAPLE_NODE_SLOTS; i++) { if (min + i > max) pr_cont("OUT OF RANGE: "); mt_dump_entry(mt_slot(mt, node->slot, i), min + i, min + i, depth, format); } break; case maple_leaf_64: case maple_range_64: mt_dump_range64(mt, entry, min, max, depth, format); break; case maple_arange_64: mt_dump_arange64(mt, entry, min, max, depth, format); break; default: pr_cont(" UNKNOWN TYPE\n"); } } void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); pr_info("maple_tree(%p) flags %X, height %u root %p\n", mt, mt->ma_flags, mt_height(mt), entry); if (!xa_is_node(entry)) mt_dump_entry(entry, 0, 0, 0, format); else if (entry) mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); } EXPORT_SYMBOL_GPL(mt_dump); /* * Calculate the maximum gap in a node and check if that's what is reported in * the parent (unless root). */ static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; struct maple_node *p_mn, *node = mte_to_node(mte); enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; unsigned char p_slot, offset; unsigned long *gaps = NULL; unsigned long *pivots = ma_pivots(node, mt); unsigned int i; if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) max_gap = gap; gap = 0; continue; } gap++; } goto counted; } gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; MT_BUG_ON(mas->tree, !entry); if (gap > p_end - p_start + 1) { pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n", mas_mn(mas), i, gap, p_end, p_start, p_end - p_start + 1); MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; p_start = p_end + 1; if (p_end >= mas->max) break; } counted: if (mt == maple_arange_64) { MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node); if (offset > i) { pr_err("gap offset %p[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); } if (gaps[offset] != max_gap) { pr_err("gap %p[%u] is not the largest gap %lu\n", node, offset, max_gap); MT_BUG_ON(mas->tree, 1); } for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { pr_err("gap %p[%u] beyond node limit != 0\n", node, i); MT_BUG_ON(mas->tree, 1); } } } if (mte_is_root(mte)) return; p_slot = mte_parent_slot(mas->node); p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, 1); } } static void mas_validate_parent_slot(struct ma_state *mas) { struct maple_node *parent; struct maple_enode *node; enum maple_type p_type; unsigned char p_slot; void __rcu **slots; int i; if (mte_is_root(mas->node)) return; p_slot = mte_parent_slot(mas->node); p_type = mas_parent_type(mas, mas->node); parent = mte_parent(mas->node); slots = ma_slots(parent, p_type); MT_BUG_ON(mas->tree, mas_mn(mas) == parent); /* Check prev/next parent slot for duplicate node entry */ for (i = 0; i < mt_slots[p_type]; i++) { node = mas_slot(mas, slots, i); if (i == p_slot) { if (node != mas->node) pr_err("parent %p[%u] does not have %p\n", parent, i, mas_mn(mas)); MT_BUG_ON(mas->tree, node != mas->node); } else if (node == mas->node) { pr_err("Invalid child %p at parent %p[%u] p_slot %u\n", mas_mn(mas), parent, i, p_slot); MT_BUG_ON(mas->tree, node == mas->node); } } } static void mas_validate_child_slot(struct ma_state *mas) { enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type); struct maple_enode *child; unsigned char i; if (mte_is_leaf(mas->node)) return; for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); if (!child) { pr_err("Non-leaf node lacks child at %p[%u]\n", mas_mn(mas), i); MT_BUG_ON(mas->tree, 1); } if (mte_parent_slot(child) != i) { pr_err("Slot error at %p[%u]: child %p has pslot %u\n", mas_mn(mas), i, mte_to_node(child), mte_parent_slot(child)); MT_BUG_ON(mas->tree, 1); } if (mte_parent(child) != mte_to_node(mas->node)) { pr_err("child %p has parent %p not %p\n", mte_to_node(child), mte_parent(child), mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } if (i < mt_pivots[type] && pivots[i] == mas->max) break; } } /* * Validate all pivots are within mas->min and mas->max, check metadata ends * where the maximum ends and ensure there is no slots or pivots set outside of * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { int i; unsigned long prev_piv = 0; enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); if (!piv && (i != 0)) { pr_err("Missing node limit pivot at %p[%u]", mas_mn(mas), i); MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { pr_err("%p[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { pr_err("%p[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { pr_err("%p[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); MAS_WARN_ON(mas, piv > mas->max); } prev_piv = piv; if (piv == mas->max) break; } if (mas_data_end(mas) != i) { pr_err("node%p: data_end %u != the last slot offset %u\n", mas_mn(mas), mas_data_end(mas), i); MT_BUG_ON(mas->tree, 1); } for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); if (entry && (i != mt_slots[type] - 1)) { pr_err("%p[%u] should not have entry %p\n", mas_mn(mas), i, entry); MT_BUG_ON(mas->tree, entry != NULL); } if (i < mt_pivots[type]) { unsigned long piv = pivots[i]; if (!piv) continue; pr_err("%p[%u] should not have piv %lu\n", mas_mn(mas), i, piv); MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } } } static void mt_validate_nulls(struct maple_tree *mt) { void *entry, *last = (void *)1; unsigned char offset = 0; void __rcu **slots; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); do { entry = mas_slot(&mas, slots, offset); if (!last && !entry) { pr_err("Sequential nulls end at %p[%u]\n", mas_mn(&mas), offset); } MT_BUG_ON(mt, !last && !entry); last = entry; if (offset == mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); } else { offset++; } } while (!mas_is_overflow(&mas)); } /* * validate a maple tree by checking: * 1. The limits (pivots are within mas->min to mas->max) * 2. The gap is correctly set in the parents */ void mt_validate(struct maple_tree *mt) { unsigned char end; MA_STATE(mas, mt, 0, 0); rcu_read_lock(); mas_start(&mas); if (!mas_is_active(&mas)) goto done; while (!mte_is_leaf(mas.node)) mas_descend(&mas); while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && (mas.max != ULONG_MAX))) { pr_err("Invalid size %u of %p\n", end, mas_mn(&mas)); } mas_validate_parent_slot(&mas); mas_validate_limits(&mas); mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); } mt_validate_nulls(mt); done: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node); switch (mas->status) { case ma_active: pr_err("(ma_active)"); break; case ma_none: pr_err("(ma_none)"); break; case ma_root: pr_err("(ma_root)"); break; case ma_start: pr_err("(ma_start) "); break; case ma_pause: pr_err("(ma_pause) "); break; case ma_overflow: pr_err("(ma_overflow) "); break; case ma_underflow: pr_err("(ma_underflow) "); break; case ma_error: pr_err("(ma_error) "); break; } pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } EXPORT_SYMBOL_GPL(mas_dump); void mas_wr_dump(const struct ma_wr_state *wr_mas) { pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, wr_mas->end_piv); } EXPORT_SYMBOL_GPL(mas_wr_dump); #endif /* CONFIG_DEBUG_MAPLE_TREE */ |
79 340 120 2871 7 849 3229 2 21 60 9 5011 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H #include <linux/sched/coredump.h> #include <linux/mm_types.h> #include <linux/fs.h> /* only for vma_is_dax() */ #include <linux/kobject.h> vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); void huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_UNSUPPORTED, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, }; struct kobject; struct kobj_attribute; ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for anonymous THP; all orders up to * and including PMD_ORDER, except order-0 (which is not "huge") and order-1 * (which is a limitation of the THP implementation). */ #define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1))) /* * Mask of all large folio orders supported for file THP. */ #define THP_ORDERS_ALL_FILE (BIT(PMD_ORDER) | BIT(PUD_ORDER)) /* * Mask of all large folio orders supported for THP. */ #define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE) #define TVA_SMAPS (1 << 0) /* Will be used for procfs */ #define TVA_IN_PF (1 << 1) /* Page fault handler */ #define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */ #define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PUD_SHIFT PUD_SHIFT #else #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) #endif #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) #define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT) #define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER) #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long transparent_hugepage_flags; extern unsigned long huge_anon_orders_always; extern unsigned long huge_anon_orders_madvise; extern unsigned long huge_anon_orders_inherit; static inline bool hugepage_global_enabled(void) { return transparent_hugepage_flags & ((1<<TRANSPARENT_HUGEPAGE_FLAG) | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)); } static inline bool hugepage_global_always(void) { return transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_FLAG); } static inline int highest_order(unsigned long orders) { return fls_long(orders) - 1; } static inline int next_order(unsigned long *orders, int prev) { *orders &= ~BIT(prev); return highest_order(*orders); } /* * Do the below checks: * - For file vma, check if the linear page offset of vma is * order-aligned within the file. The hugepage is * guaranteed to be order-aligned within the file, but we must * check that the order-aligned addresses in the VMA map to * order-aligned offsets within the file, else the hugepage will * not be mappable. * - For all vmas, check if the haddr is in an aligned hugepage * area. */ static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, unsigned long addr, int order) { unsigned long hpage_size = PAGE_SIZE << order; unsigned long haddr; /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, hpage_size >> PAGE_SHIFT)) return false; } haddr = ALIGN_DOWN(addr, hpage_size); if (haddr < vma->vm_start || haddr + hpage_size > vma->vm_end) return false; return true; } /* * Filter the bitfield of input orders to the ones suitable for use in the vma. * See thp_vma_suitable_order(). * All orders that pass the checks are returned as a bitfield. */ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, unsigned long addr, unsigned long orders) { int order; /* * Iterate over orders, highest to lowest, removing orders that don't * meet alignment requirements from the set. Exit loop at first order * that meets requirements, since all lower orders must also meet * requirements. */ order = highest_order(orders); while (orders) { if (thp_vma_suitable_order(vma, addr, order)) break; order = next_order(&orders, order); } return orders; } static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; if (!vma->vm_file) return false; inode = vma->vm_file->f_inode; return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) && !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long tva_flags, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check * @vm_flags: use these vm_flags instead of vma->vm_flags * @tva_flags: Which TVA flags to honour * @orders: bitfield of all orders to consider * * Calculates the intersection of the requested hugepage orders and the allowed * hugepage orders for the provided vma. Permitted orders are encoded as a set * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3 * corresponds to order-3, etc). Order-0 is never considered a hugepage order. * * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage * orders are allowed. */ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long tva_flags, unsigned long orders) { /* Optimization to check if required orders are enabled early. */ if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) { unsigned long mask = READ_ONCE(huge_anon_orders_always); if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_anon_orders_madvise); if (hugepage_global_always() || ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled())) mask |= READ_ONCE(huge_anon_orders_inherit); orders &= mask; if (!orders) return 0; } return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } struct thpsize { struct kobject kobj; struct list_head node; int order; }; #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, MTHP_STAT_SHMEM_FALLBACK, MTHP_STAT_SHMEM_FALLBACK_CHARGE, MTHP_STAT_SPLIT, MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, __MTHP_STAT_COUNT }; struct mthp_stat { unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; }; #ifdef CONFIG_SYSFS DECLARE_PER_CPU(struct mthp_stat, mthp_stats); static inline void count_mthp_stat(int order, enum mthp_stat_item item) { if (order <= 0 || order > PMD_ORDER) return; this_cpu_inc(mthp_stats.stats[order][item]); } #else static inline void count_mthp_stat(int order, enum mthp_stat_item item) { } #endif #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); static inline int split_huge_page(struct page *page) { return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ } while (0) void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ if (pud_trans_huge(*____pud) \ || pud_devmap(*____pud)) \ __split_huge_pud(__vma, __pud, __address); \ } while (0) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); static inline int is_swap_pmd(pmd_t pmd) { return !pmd_none(pmd) && !pmd_present(pmd); } /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { if (pud_trans_huge(*pud) || pud_devmap(*pud)) return __pud_trans_huge_lock(pud, vma); else return NULL; } /** * folio_test_pmd_mappable - Can we map this folio with a PMD? * @folio: The folio to test */ static inline bool folio_test_pmd_mappable(struct folio *folio) { return folio_order(folio) >= HPAGE_PMD_ORDER; } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap); vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); extern struct folio *huge_zero_folio; extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_folio(const struct folio *folio) { return READ_ONCE(huge_zero_folio) == folio; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); } static inline bool is_huge_zero_pud(pud_t pud) { return false; } struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze, struct folio *folio); bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool folio_test_pmd_mappable(struct folio *folio) { return false; } static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, unsigned long addr, int order) { return false; } static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, unsigned long addr, unsigned long orders) { return 0; } static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long tva_flags, unsigned long orders) { return 0; } #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL static inline unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return 0; } static inline bool can_split_folio(struct folio *folio, int *pextra_pins) { return false; } static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { return 0; } static inline int split_huge_page(struct page *page) { return 0; } static inline void deferred_split_folio(struct folio *folio) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) {} static inline void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze, struct folio *folio) {} static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio) { return false; } #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { return -EINVAL; } static inline int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { return -EINVAL; } static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { } static inline int is_swap_pmd(pmd_t pmd) { return 0; } static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { return NULL; } static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { return 0; } static inline bool is_huge_zero_folio(const struct folio *folio) { return false; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return false; } static inline bool is_huge_zero_pud(pud_t pud) { return false; } static inline void mm_put_huge_zero_folio(struct mm_struct *mm) { return; } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline bool thp_migration_supported(void) { return false; } static inline int highest_order(unsigned long orders) { return 0; } static inline int next_order(unsigned long *orders, int prev) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, struct list_head *list, int new_order) { return split_huge_page_to_list_to_order(&folio->page, list, new_order); } static inline int split_folio_to_order(struct folio *folio, int new_order) { return split_folio_to_list_to_order(folio, NULL, new_order); } #define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0) #define split_folio(f) split_folio_to_order(f, 0) #endif /* _LINUX_HUGE_MM_H */ |
3 1 4 3 7 1 6 2 2 77 2 75 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * export.c */ /* * This file implements code to make Squashfs filesystems exportable (NFS etc.) * * The export code uses an inode lookup table to map inode numbers passed in * filehandles to an inode location on disk. This table is stored compressed * into metadata blocks. A second index table is used to locate these. This * second index table for speed of access (and because it is small) is read at * mount time and cached in memory. * * The inode lookup table is used only by the export code, inode disk * locations are directly encoded in directories, enabling direct access * without an intermediate lookup for all operations except the export ops. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/dcache.h> #include <linux/exportfs.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" /* * Look-up inode number (ino) in table, returning the inode location. */ static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) { struct squashfs_sb_info *msblk = sb->s_fs_info; int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); u64 start; __le64 ino; int err; TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); if (ino_num == 0 || (ino_num - 1) >= msblk->inodes) return -EINVAL; start = le64_to_cpu(msblk->inode_lookup_table[blk]); err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); if (err < 0) return err; TRACE("squashfs_inode_lookup, inode = 0x%llx\n", (u64) le64_to_cpu(ino)); return le64_to_cpu(ino); } static struct dentry *squashfs_export_iget(struct super_block *sb, unsigned int ino_num) { long long ino; struct dentry *dentry = ERR_PTR(-ENOENT); TRACE("Entered squashfs_export_iget\n"); ino = squashfs_inode_lookup(sb, ino_num); if (ino >= 0) dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num)); return dentry; } static struct dentry *squashfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) || fh_len < 2) return NULL; return squashfs_export_iget(sb, fid->i32.ino); } static struct dentry *squashfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4) return NULL; return squashfs_export_iget(sb, fid->i32.parent_ino); } static struct dentry *squashfs_get_parent(struct dentry *child) { struct inode *inode = d_inode(child); unsigned int parent_ino = squashfs_i(inode)->parent; return squashfs_export_iget(inode->i_sb, parent_ino); } /* * Read uncompressed inode lookup table indexes off disk into memory */ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, u64 lookup_table_start, u64 next_table, unsigned int inodes) { unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); unsigned int indexes = SQUASHFS_LOOKUP_BLOCKS(inodes); int n; __le64 *table; u64 start, end; TRACE("In read_inode_lookup_table, length %d\n", length); /* Sanity check values */ /* there should always be at least one inode */ if (inodes == 0) return ERR_PTR(-EINVAL); /* * The computed size of the lookup table (length bytes) should exactly * match the table start and end points */ if (length != (next_table - lookup_table_start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, lookup_table_start, length); if (IS_ERR(table)) return table; /* * table0], table[1], ... table[indexes - 1] store the locations * of the compressed inode lookup blocks. Each entry should be * less than the next (i.e. table[0] < table[1]), and the difference * between them should be SQUASHFS_METADATA_SIZE or less. * table[indexes - 1] should be less than lookup_table_start, and * again the difference should be SQUASHFS_METADATA_SIZE or less */ for (n = 0; n < (indexes - 1); n++) { start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); if (start >= end || (end - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); if (start >= lookup_table_start || (lookup_table_start - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } return table; } const struct export_operations squashfs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = squashfs_fh_to_dentry, .fh_to_parent = squashfs_fh_to_parent, .get_parent = squashfs_get_parent }; |
1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Quickcam cameras initialization data * * V4L2 by Jean-Francois Moine <http://moinejf.free.fr> */ #define MODULE_NAME "tv8532" #include "gspca.h" MODULE_AUTHOR("Michel Xhaard <mxhaard@users.sourceforge.net>"); MODULE_DESCRIPTION("TV8532 USB Camera Driver"); MODULE_LICENSE("GPL"); /* specific webcam descriptor */ struct sd { struct gspca_dev gspca_dev; /* !! must be the first item */ __u8 packet; }; static const struct v4l2_pix_format sif_mode[] = { {176, 144, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 176, .sizeimage = 176 * 144, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 1}, {352, 288, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 352, .sizeimage = 352 * 288, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0}, }; /* TV-8532A (ICM532A) registers (LE) */ #define R00_PART_CONTROL 0x00 #define LATENT_CHANGE 0x80 #define EXPO_CHANGE 0x04 #define R01_TIMING_CONTROL_LOW 0x01 #define CMD_EEprom_Open 0x30 #define CMD_EEprom_Close 0x29 #define R03_TABLE_ADDR 0x03 #define R04_WTRAM_DATA_L 0x04 #define R05_WTRAM_DATA_M 0x05 #define R06_WTRAM_DATA_H 0x06 #define R07_TABLE_LEN 0x07 #define R08_RAM_WRITE_ACTION 0x08 #define R0C_AD_WIDTHL 0x0c #define R0D_AD_WIDTHH 0x0d #define R0E_AD_HEIGHTL 0x0e #define R0F_AD_HEIGHTH 0x0f #define R10_AD_COL_BEGINL 0x10 #define R11_AD_COL_BEGINH 0x11 #define MIRROR 0x04 /* [10] */ #define R14_AD_ROW_BEGINL 0x14 #define R15_AD_ROWBEGINH 0x15 #define R1C_AD_EXPOSE_TIMEL 0x1c #define R20_GAIN_G1L 0x20 #define R21_GAIN_G1H 0x21 #define R22_GAIN_RL 0x22 #define R23_GAIN_RH 0x23 #define R24_GAIN_BL 0x24 #define R25_GAIN_BH 0x25 #define R26_GAIN_G2L 0x26 #define R27_GAIN_G2H 0x27 #define R28_QUANT 0x28 #define R29_LINE 0x29 #define R2C_POLARITY 0x2c #define R2D_POINT 0x2d #define R2E_POINTH 0x2e #define R2F_POINTB 0x2f #define R30_POINTBH 0x30 #define R31_UPD 0x31 #define R2A_HIGH_BUDGET 0x2a #define R2B_LOW_BUDGET 0x2b #define R34_VID 0x34 #define R35_VIDH 0x35 #define R36_PID 0x36 #define R37_PIDH 0x37 #define R39_Test1 0x39 /* GPIO */ #define R3B_Test3 0x3b /* GPIO */ #define R83_AD_IDH 0x83 #define R91_AD_SLOPEREG 0x91 #define R94_AD_BITCONTROL 0x94 static const u8 eeprom_data[][3] = { /* dataH dataM dataL */ {0x01, 0x00, 0x01}, {0x01, 0x80, 0x11}, {0x05, 0x00, 0x14}, {0x05, 0x00, 0x1c}, {0x0d, 0x00, 0x1e}, {0x05, 0x00, 0x1f}, {0x05, 0x05, 0x19}, {0x05, 0x01, 0x1b}, {0x05, 0x09, 0x1e}, {0x0d, 0x89, 0x2e}, {0x05, 0x89, 0x2f}, {0x05, 0x0d, 0xd9}, {0x05, 0x09, 0xf1}, }; /* write 1 byte */ static void reg_w1(struct gspca_dev *gspca_dev, __u16 index, __u8 value) { gspca_dev->usb_buf[0] = value; usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0x02, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, /* value */ index, gspca_dev->usb_buf, 1, 500); } /* write 2 bytes */ static void reg_w2(struct gspca_dev *gspca_dev, u16 index, u16 value) { gspca_dev->usb_buf[0] = value; gspca_dev->usb_buf[1] = value >> 8; usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0x02, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, /* value */ index, gspca_dev->usb_buf, 2, 500); } static void tv_8532WriteEEprom(struct gspca_dev *gspca_dev) { int i; reg_w1(gspca_dev, R01_TIMING_CONTROL_LOW, CMD_EEprom_Open); for (i = 0; i < ARRAY_SIZE(eeprom_data); i++) { reg_w1(gspca_dev, R03_TABLE_ADDR, i); reg_w1(gspca_dev, R04_WTRAM_DATA_L, eeprom_data[i][2]); reg_w1(gspca_dev, R05_WTRAM_DATA_M, eeprom_data[i][1]); reg_w1(gspca_dev, R06_WTRAM_DATA_H, eeprom_data[i][0]); reg_w1(gspca_dev, R08_RAM_WRITE_ACTION, 0); } reg_w1(gspca_dev, R07_TABLE_LEN, i); reg_w1(gspca_dev, R01_TIMING_CONTROL_LOW, CMD_EEprom_Close); } /* this function is called at probe time */ static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct cam *cam; cam = &gspca_dev->cam; cam->cam_mode = sif_mode; cam->nmodes = ARRAY_SIZE(sif_mode); return 0; } static void tv_8532_setReg(struct gspca_dev *gspca_dev) { reg_w1(gspca_dev, R3B_Test3, 0x0a); /* Test0Sel = 10 */ /******************************************************/ reg_w1(gspca_dev, R0E_AD_HEIGHTL, 0x90); reg_w1(gspca_dev, R0F_AD_HEIGHTH, 0x01); reg_w2(gspca_dev, R1C_AD_EXPOSE_TIMEL, 0x018f); reg_w1(gspca_dev, R10_AD_COL_BEGINL, 0x44); /* begin active line */ reg_w1(gspca_dev, R11_AD_COL_BEGINH, 0x00); /* mirror and digital gain */ reg_w1(gspca_dev, R14_AD_ROW_BEGINL, 0x0a); reg_w1(gspca_dev, R94_AD_BITCONTROL, 0x02); reg_w1(gspca_dev, R91_AD_SLOPEREG, 0x00); reg_w1(gspca_dev, R00_PART_CONTROL, LATENT_CHANGE | EXPO_CHANGE); /* = 0x84 */ } /* this function is called at probe and resume time */ static int sd_init(struct gspca_dev *gspca_dev) { tv_8532WriteEEprom(gspca_dev); return 0; } static void setexposure(struct gspca_dev *gspca_dev, s32 val) { reg_w2(gspca_dev, R1C_AD_EXPOSE_TIMEL, val); reg_w1(gspca_dev, R00_PART_CONTROL, LATENT_CHANGE | EXPO_CHANGE); /* 0x84 */ } static void setgain(struct gspca_dev *gspca_dev, s32 val) { reg_w2(gspca_dev, R20_GAIN_G1L, val); reg_w2(gspca_dev, R22_GAIN_RL, val); reg_w2(gspca_dev, R24_GAIN_BL, val); reg_w2(gspca_dev, R26_GAIN_G2L, val); } /* -- start the camera -- */ static int sd_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; reg_w1(gspca_dev, R0C_AD_WIDTHL, 0xe8); /* 0x20; 0x0c */ reg_w1(gspca_dev, R0D_AD_WIDTHH, 0x03); /************************************************/ reg_w1(gspca_dev, R28_QUANT, 0x90); /* 0x72 compressed mode 0x28 */ if (gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv) { /* 176x144 */ reg_w1(gspca_dev, R29_LINE, 0x41); /* CIF - 2 lines/packet */ } else { /* 352x288 */ reg_w1(gspca_dev, R29_LINE, 0x81); /* CIF - 2 lines/packet */ } /************************************************/ reg_w1(gspca_dev, R2C_POLARITY, 0x10); /* slow clock */ reg_w1(gspca_dev, R2D_POINT, 0x14); reg_w1(gspca_dev, R2E_POINTH, 0x01); reg_w1(gspca_dev, R2F_POINTB, 0x12); reg_w1(gspca_dev, R30_POINTBH, 0x01); tv_8532_setReg(gspca_dev); /************************************************/ reg_w1(gspca_dev, R31_UPD, 0x01); /* update registers */ msleep(200); reg_w1(gspca_dev, R31_UPD, 0x00); /* end update */ gspca_dev->empty_packet = 0; /* check the empty packets */ sd->packet = 0; /* ignore the first packets */ return 0; } static void sd_stopN(struct gspca_dev *gspca_dev) { reg_w1(gspca_dev, R3B_Test3, 0x0b); /* Test0Sel = 11 = GPIO */ } static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { struct sd *sd = (struct sd *) gspca_dev; int packet_type0, packet_type1; packet_type0 = packet_type1 = INTER_PACKET; if (gspca_dev->empty_packet) { gspca_dev->empty_packet = 0; sd->packet = gspca_dev->pixfmt.height / 2; packet_type0 = FIRST_PACKET; } else if (sd->packet == 0) return; /* 2 more lines in 352x288 ! */ sd->packet--; if (sd->packet == 0) packet_type1 = LAST_PACKET; /* each packet contains: * - header 2 bytes * - RGRG line * - 4 bytes * - GBGB line * - 4 bytes */ gspca_frame_add(gspca_dev, packet_type0, data + 2, gspca_dev->pixfmt.width); gspca_frame_add(gspca_dev, packet_type1, data + gspca_dev->pixfmt.width + 5, gspca_dev->pixfmt.width); } static int sd_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); gspca_dev->usb_err = 0; if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_EXPOSURE: setexposure(gspca_dev, ctrl->val); break; case V4L2_CID_GAIN: setgain(gspca_dev, ctrl->val); break; } return gspca_dev->usb_err; } static const struct v4l2_ctrl_ops sd_ctrl_ops = { .s_ctrl = sd_s_ctrl, }; static int sd_init_controls(struct gspca_dev *gspca_dev) { struct v4l2_ctrl_handler *hdl = &gspca_dev->ctrl_handler; gspca_dev->vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 2); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_EXPOSURE, 0, 0x18f, 1, 0x18f); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_GAIN, 0, 0x7ff, 1, 0x100); if (hdl->error) { pr_err("Could not initialize controls\n"); return hdl->error; } return 0; } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .init_controls = sd_init_controls, .start = sd_start, .stopN = sd_stopN, .pkt_scan = sd_pkt_scan, }; /* -- module initialisation -- */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x046d, 0x0920)}, {USB_DEVICE(0x046d, 0x0921)}, {USB_DEVICE(0x0545, 0x808b)}, {USB_DEVICE(0x0545, 0x8333)}, {USB_DEVICE(0x0923, 0x010f)}, {} }; MODULE_DEVICE_TABLE(usb, device_table); /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver); |
12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * inet6 interface/address list definitions * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IF_INET6_H #define _NET_IF_INET6_H #include <net/snmp.h> #include <linux/ipv6.h> #include <linux/refcount.h> /* inet6_dev.if_flags */ #define IF_RA_OTHERCONF 0x80 #define IF_RA_MANAGED 0x40 #define IF_RA_RCVD 0x20 #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, INET6_IFADDR_STATE_POSTDAD, INET6_IFADDR_STATE_ERRDAD, INET6_IFADDR_STATE_DEAD, }; struct inet6_ifaddr { struct in6_addr addr; __u32 prefix_len; __u32 rt_priority; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ __u32 valid_lft; __u32 prefered_lft; refcount_t refcnt; spinlock_t lock; int state; __u32 flags; __u8 dad_probes; __u8 stable_privacy_retry; __u16 scope; __u64 dad_nonce; unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ struct delayed_work dad_work; struct inet6_dev *idev; struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; /* * Used to safely traverse idev->addr_list in process context * if the idev->lock needed to protect idev->addr_list cannot be held. * In that case, add the items to this list temporarily and iterate * without holding idev->lock. * See addrconf_ifdown and dev_forward_change. */ struct list_head if_list_aux; struct list_head tmp_list; struct inet6_ifaddr *ifpub; int regen_count; bool tokenized; u8 ifa_proto; struct rcu_head rcu; struct in6_addr peer_addr; }; struct ip6_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; struct in6_addr sl_addr[] __counted_by(sl_max); }; #define IP6_SFBLOCK 10 /* allocate this many at once */ struct ipv6_mc_socklist { struct in6_addr addr; int ifindex; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ipv6_mc_socklist __rcu *next; struct ip6_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip6_sf_list { struct ip6_sf_list __rcu *sf_next; struct in6_addr sf_addr; unsigned long sf_count[2]; /* include/exclude counts */ unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans. left to send */ struct rcu_head rcu; }; #define MAF_TIMER_RUNNING 0x01 #define MAF_LAST_REPORTER 0x02 #define MAF_LOADED 0x04 #define MAF_NOREPORT 0x08 #define MAF_GSQUERY 0x10 struct ifmcaddr6 { struct in6_addr mca_addr; struct inet6_dev *idev; struct ifmcaddr6 __rcu *next; struct ip6_sf_list __rcu *mca_sources; struct ip6_sf_list __rcu *mca_tomb; unsigned int mca_sfmode; unsigned char mca_crcount; unsigned long mca_sfcount[2]; struct delayed_work mca_work; unsigned int mca_flags; int mca_users; refcount_t mca_refcnt; unsigned long mca_cstamp; unsigned long mca_tstamp; struct rcu_head rcu; }; /* Anycast stuff */ struct ipv6_ac_socklist { struct in6_addr acl_addr; int acl_ifindex; struct ipv6_ac_socklist *acl_next; }; struct ifacaddr6 { struct in6_addr aca_addr; struct fib6_info *aca_rt; struct ifacaddr6 __rcu *aca_next; struct hlist_node aca_addr_lst; int aca_users; refcount_t aca_refcnt; unsigned long aca_cstamp; unsigned long aca_tstamp; struct rcu_head rcu; }; #define IFA_HOST IPV6_ADDR_LOOPBACK #define IFA_LINK IPV6_ADDR_LINKLOCAL #define IFA_SITE IPV6_ADDR_SITELOCAL struct ipv6_devstat { struct proc_dir_entry *proc_dir_entry; DEFINE_SNMP_STAT(struct ipstats_mib, ipv6); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6_mib_device, icmpv6dev); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib_device, icmpv6msgdev); }; struct inet6_dev { struct net_device *dev; netdevice_tracker dev_tracker; struct list_head addr_list; struct ifmcaddr6 __rcu *mc_list; struct ifmcaddr6 __rcu *mc_tomb; unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; unsigned char mc_ifc_count; unsigned char mc_dad_count; unsigned long mc_v1_seen; /* Max time we stay in MLDv1 mode */ unsigned long mc_qi; /* Query Interval */ unsigned long mc_qri; /* Query Response Interval */ unsigned long mc_maxdelay; struct delayed_work mc_gq_work; /* general query work */ struct delayed_work mc_ifc_work; /* interface change work */ struct delayed_work mc_dad_work; /* dad complete mc work */ struct delayed_work mc_query_work; /* mld query work */ struct delayed_work mc_report_work; /* mld report work */ struct sk_buff_head mc_query_queue; /* mld query queue */ struct sk_buff_head mc_report_queue; /* mld report queue */ spinlock_t mc_query_lock; /* mld query queue lock */ spinlock_t mc_report_lock; /* mld query report lock */ struct mutex mc_lock; /* mld global lock */ struct ifacaddr6 __rcu *ac_list; rwlock_t lock; refcount_t refcnt; __u32 if_flags; int dead; u32 desync_factor; struct list_head tempaddr_list; struct in6_addr token; struct neigh_parms *nd_parms; struct ipv6_devconf cnf; struct ipv6_devstat stats; struct timer_list rs_timer; __s32 rs_interval; /* in jiffies */ __u8 rs_probes; unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; unsigned int ra_mtu; }; static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf) { /* * +-------+-------+-------+-------+-------+-------+ * | 33 | 33 | DST13 | DST14 | DST15 | DST16 | * +-------+-------+-------+-------+-------+-------+ */ buf[0]= 0x33; buf[1]= 0x33; memcpy(buf + 2, &addr->s6_addr32[3], sizeof(__u32)); } static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf) { buf[0] = 0x00; } static inline void ipv6_ib_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { unsigned char scope = broadcast[5] & 0xF; buf[0] = 0; /* Reserved */ buf[1] = 0xff; /* Multicast QPN */ buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; /* scope from broadcast address */ buf[6] = 0x60; /* IPv6 signature */ buf[7] = 0x1b; buf[8] = broadcast[8]; /* P_Key */ buf[9] = broadcast[9]; memcpy(buf + 10, addr->s6_addr + 6, 10); } static inline int ipv6_ipgre_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) { memcpy(buf, broadcast, 4); } else { /* v4mapped? */ if ((addr->s6_addr32[0] | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x0000ffff))) != 0) return -EINVAL; memcpy(buf, &addr->s6_addr32[3], 4); } return 0; } #endif |
45 45 9 22 22 45 45 45 64 63 35 11 63 64 19 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include "percpu_freelist.h" int pcpu_freelist_init(struct pcpu_freelist *s) { int cpu; s->freelist = alloc_percpu(struct pcpu_freelist_head); if (!s->freelist) return -ENOMEM; for_each_possible_cpu(cpu) { struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); raw_spin_lock_init(&head->lock); head->first = NULL; } raw_spin_lock_init(&s->extralist.lock); s->extralist.first = NULL; return 0; } void pcpu_freelist_destroy(struct pcpu_freelist *s) { free_percpu(s->freelist); } static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { node->next = head->first; WRITE_ONCE(head->first, node); } static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { raw_spin_lock(&head->lock); pcpu_freelist_push_node(head, node); raw_spin_unlock(&head->lock); } static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { if (!raw_spin_trylock(&s->extralist.lock)) return false; pcpu_freelist_push_node(&s->extralist, node); raw_spin_unlock(&s->extralist.lock); return true; } static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { int cpu, orig_cpu; orig_cpu = raw_smp_processor_id(); while (1) { for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { struct pcpu_freelist_head *head; head = per_cpu_ptr(s->freelist, cpu); if (raw_spin_trylock(&head->lock)) { pcpu_freelist_push_node(head, node); raw_spin_unlock(&head->lock); return; } } /* cannot lock any per cpu lock, try extralist */ if (pcpu_freelist_try_push_extra(s, node)) return; } } void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { if (in_nmi()) ___pcpu_freelist_push_nmi(s, node); else ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); } void pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { unsigned long flags; local_irq_save(flags); __pcpu_freelist_push(s, node); local_irq_restore(flags); } void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems) { struct pcpu_freelist_head *head; unsigned int cpu, cpu_idx, i, j, n, m; n = nr_elems / num_possible_cpus(); m = nr_elems % num_possible_cpus(); cpu_idx = 0; for_each_possible_cpu(cpu) { head = per_cpu_ptr(s->freelist, cpu); j = n + (cpu_idx < m ? 1 : 0); for (i = 0; i < j; i++) { /* No locking required as this is not visible yet. */ pcpu_freelist_push_node(head, buf); buf += elem_size; } cpu_idx++; } } static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; raw_spin_lock(&head->lock); node = head->first; if (node) { WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); } /* per cpu lists are all empty, try extralist */ if (!READ_ONCE(s->extralist.first)) return NULL; raw_spin_lock(&s->extralist.lock); node = s->extralist.first; if (node) WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } static struct pcpu_freelist_node * ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; if (raw_spin_trylock(&head->lock)) { node = head->first; if (node) { WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); } } /* cannot pop from per cpu lists, try extralist */ if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) return NULL; node = s->extralist.first; if (node) WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { if (in_nmi()) return ___pcpu_freelist_pop_nmi(s); return ___pcpu_freelist_pop(s); } struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_node *ret; unsigned long flags; local_irq_save(flags); ret = __pcpu_freelist_pop(s); local_irq_restore(flags); return ret; } |
2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 | /* * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. */ /* zstd_ddict.c : * concentrates all logic that needs to know the internals of ZSTD_DDict object */ /*-******************************************************* * Dependencies *********************************************************/ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ #include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" #define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "zstd_decompress_internal.h" #include "zstd_ddict.h" /*-******************************************************* * Types *********************************************************/ struct ZSTD_DDict_s { void* dictBuffer; const void* dictContent; size_t dictSize; ZSTD_entropyDTables_t entropy; U32 dictID; U32 entropyPresent; ZSTD_customMem cMem; }; /* typedef'd to ZSTD_DDict within "zstd.h" */ const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict) { assert(ddict != NULL); return ddict->dictContent; } size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict) { assert(ddict != NULL); return ddict->dictSize; } void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) { DEBUGLOG(4, "ZSTD_copyDDictParameters"); assert(dctx != NULL); assert(ddict != NULL); dctx->dictID = ddict->dictID; dctx->prefixStart = ddict->dictContent; dctx->virtualStart = ddict->dictContent; dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize; dctx->previousDstEnd = dctx->dictEnd; #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION dctx->dictContentBeginForFuzzing = dctx->prefixStart; dctx->dictContentEndForFuzzing = dctx->previousDstEnd; #endif if (ddict->entropyPresent) { dctx->litEntropy = 1; dctx->fseEntropy = 1; dctx->LLTptr = ddict->entropy.LLTable; dctx->MLTptr = ddict->entropy.MLTable; dctx->OFTptr = ddict->entropy.OFTable; dctx->HUFptr = ddict->entropy.hufTable; dctx->entropy.rep[0] = ddict->entropy.rep[0]; dctx->entropy.rep[1] = ddict->entropy.rep[1]; dctx->entropy.rep[2] = ddict->entropy.rep[2]; } else { dctx->litEntropy = 0; dctx->fseEntropy = 0; } } static size_t ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e dictContentType) { ddict->dictID = 0; ddict->entropyPresent = 0; if (dictContentType == ZSTD_dct_rawContent) return 0; if (ddict->dictSize < 8) { if (dictContentType == ZSTD_dct_fullDict) return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ return 0; /* pure content mode */ } { U32 const magic = MEM_readLE32(ddict->dictContent); if (magic != ZSTD_MAGIC_DICTIONARY) { if (dictContentType == ZSTD_dct_fullDict) return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ return 0; /* pure content mode */ } } ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE); /* load entropy tables */ RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy( &ddict->entropy, ddict->dictContent, ddict->dictSize)), dictionary_corrupted, ""); ddict->entropyPresent = 1; return 0; } static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) { if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) { ddict->dictBuffer = NULL; ddict->dictContent = dict; if (!dict) dictSize = 0; } else { void* const internalBuffer = ZSTD_customMalloc(dictSize, ddict->cMem); ddict->dictBuffer = internalBuffer; ddict->dictContent = internalBuffer; if (!internalBuffer) return ERROR(memory_allocation); ZSTD_memcpy(internalBuffer, dict, dictSize); } ddict->dictSize = dictSize; ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ /* parse dictionary content */ FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); return 0; } ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_customMalloc(sizeof(ZSTD_DDict), customMem); if (ddict == NULL) return NULL; ddict->cMem = customMem; { size_t const initResult = ZSTD_initDDict_internal(ddict, dict, dictSize, dictLoadMethod, dictContentType); if (ZSTD_isError(initResult)) { ZSTD_freeDDict(ddict); return NULL; } } return ddict; } } /*! ZSTD_createDDict() : * Create a digested dictionary, to start decompression without startup delay. * `dict` content is copied inside DDict. * Consequently, `dict` can be released after `ZSTD_DDict` creation */ ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize) { ZSTD_customMem const allocator = { NULL, NULL, NULL }; return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator); } /*! ZSTD_createDDict_byReference() : * Create a digested dictionary, to start decompression without startup delay. * Dictionary content is simply referenced, it will be accessed during decompression. * Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */ ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize) { ZSTD_customMem const allocator = { NULL, NULL, NULL }; return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator); } const ZSTD_DDict* ZSTD_initStaticDDict( void* sBuffer, size_t sBufferSize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) { size_t const neededSpace = sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer; assert(sBuffer != NULL); assert(dict != NULL); if ((size_t)sBuffer & 7) return NULL; /* 8-aligned */ if (sBufferSize < neededSpace) return NULL; if (dictLoadMethod == ZSTD_dlm_byCopy) { ZSTD_memcpy(ddict+1, dict, dictSize); /* local copy */ dict = ddict+1; } if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, ZSTD_dlm_byRef, dictContentType) )) return NULL; return ddict; } size_t ZSTD_freeDDict(ZSTD_DDict* ddict) { if (ddict==NULL) return 0; /* support free on NULL */ { ZSTD_customMem const cMem = ddict->cMem; ZSTD_customFree(ddict->dictBuffer, cMem); ZSTD_customFree(ddict, cMem); return 0; } } /*! ZSTD_estimateDDictSize() : * Estimate amount of memory that will be needed to create a dictionary for decompression. * Note : dictionary created by reference using ZSTD_dlm_byRef are smaller */ size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod) { return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); } size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; /* support sizeof on NULL */ return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ; } /*! ZSTD_getDictID_fromDDict() : * Provides the dictID of the dictionary loaded into `ddict`. * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); } |
3 3 9 10 59 9 5 68 22 28 33 12 18 3 11 78 28 5 14 69 18 9 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/bitmap.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/export.h> #include <linux/hex.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include "kstrtox.h" /** * bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. */ int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { char *buf; int ret; buf = memdup_user_nul(ubuf, ulen); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_parse(buf, UINT_MAX, maskp, nmaskbits); kfree(buf); return ret; } EXPORT_SYMBOL(bitmap_parse_user); /** * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list * @buf: page aligned buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * * Output format is a comma-separated list of decimal numbers and * ranges if list is specified or hex digits grouped into comma-separated * sets of 8 digits/set. Returns the number of characters written to buf. * * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned * area and that sufficient storage remains at @buf to accommodate the * bitmap_print_to_pagebuf() output. Returns the number of characters * actually printed to @buf, excluding terminating '\0'. */ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits) { ptrdiff_t len = PAGE_SIZE - offset_in_page(buf); return list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) : scnprintf(buf, len, "%*pb\n", nmaskbits, maskp); } EXPORT_SYMBOL(bitmap_print_to_pagebuf); /** * bitmap_print_to_buf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list * true: print in decimal list format * false: print in hexadecimal bitmask format * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print */ static int bitmap_print_to_buf(bool list, char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { const char *fmt = list ? "%*pbl\n" : "%*pb\n"; ssize_t size; void *data; data = kasprintf(GFP_KERNEL, fmt, nmaskbits, maskp); if (!data) return -ENOMEM; size = memory_read_from_buffer(buf, count, &off, data, strlen(data) + 1); kfree(data); return size; } /** * bitmap_print_bitmask_to_buf - convert bitmap to hex bitmask format ASCII string * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print * * The bitmap_print_to_pagebuf() is used indirectly via its cpumap wrapper * cpumap_print_to_pagebuf() or directly by drivers to export hexadecimal * bitmask and decimal list to userspace by sysfs ABI. * Drivers might be using a normal attribute for this kind of ABIs. A * normal attribute typically has show entry as below:: * * static ssize_t example_attribute_show(struct device *dev, * struct device_attribute *attr, char *buf) * { * ... * return bitmap_print_to_pagebuf(true, buf, &mask, nr_trig_max); * } * * show entry of attribute has no offset and count parameters and this * means the file is limited to one page only. * bitmap_print_to_pagebuf() API works terribly well for this kind of * normal attribute with buf parameter and without offset, count:: * * bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, * int nmaskbits) * { * } * * The problem is once we have a large bitmap, we have a chance to get a * bitmask or list more than one page. Especially for list, it could be * as complex as 0,3,5,7,9,... We have no simple way to know it exact size. * It turns out bin_attribute is a way to break this limit. bin_attribute * has show entry as below:: * * static ssize_t * example_bin_attribute_show(struct file *filp, struct kobject *kobj, * struct bin_attribute *attr, char *buf, * loff_t offset, size_t count) * { * ... * } * * With the new offset and count parameters, this makes sysfs ABI be able * to support file size more than one page. For example, offset could be * >= 4096. * bitmap_print_bitmask_to_buf(), bitmap_print_list_to_buf() wit their * cpumap wrapper cpumap_print_bitmask_to_buf(), cpumap_print_list_to_buf() * make those drivers be able to support large bitmask and list after they * move to use bin_attribute. In result, we have to pass the corresponding * parameters such as off, count from bin_attribute show entry to this API. * * The role of cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf() * is similar with cpumap_print_to_pagebuf(), the difference is that * bitmap_print_to_pagebuf() mainly serves sysfs attribute with the assumption * the destination buffer is exactly one page and won't be more than one page. * cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf(), on the other * hand, mainly serves bin_attribute which doesn't work with exact one page, * and it can break the size limit of converted decimal list and hexadecimal * bitmask. * * WARNING! * * This function is not a replacement for sprintf() or bitmap_print_to_pagebuf(). * It is intended to workaround sysfs limitations discussed above and should be * used carefully in general case for the following reasons: * * - Time complexity is O(nbits^2/count), comparing to O(nbits) for snprintf(). * - Memory complexity is O(nbits), comparing to O(1) for snprintf(). * - @off and @count are NOT offset and number of bits to print. * - If printing part of bitmap as list, the resulting string is not a correct * list representation of bitmap. Particularly, some bits within or out of * related interval may be erroneously set or unset. The format of the string * may be broken, so bitmap_parselist-like parser may fail parsing it. * - If printing the whole bitmap as list by parts, user must ensure the order * of calls of the function such that the offset is incremented linearly. * - If printing the whole bitmap as list by parts, user must keep bitmap * unchanged between the very first and very last call. Otherwise concatenated * result may be incorrect, and format may be broken. * * Returns the number of characters actually printed to @buf */ int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { return bitmap_print_to_buf(false, buf, maskp, nmaskbits, off, count); } EXPORT_SYMBOL(bitmap_print_bitmask_to_buf); /** * bitmap_print_list_to_buf - convert bitmap to decimal list format ASCII string * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print * * Everything is same with the above bitmap_print_bitmask_to_buf() except * the print format. */ int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { return bitmap_print_to_buf(true, buf, maskp, nmaskbits, off, count); } EXPORT_SYMBOL(bitmap_print_list_to_buf); /* * Region 9-38:4/10 describes the following bitmap structure: * 0 9 12 18 38 N * .........****......****......****.................. * ^ ^ ^ ^ ^ * start off group_len end nbits */ struct region { unsigned int start; unsigned int off; unsigned int group_len; unsigned int end; unsigned int nbits; }; static void bitmap_set_region(const struct region *r, unsigned long *bitmap) { unsigned int start; for (start = r->start; start <= r->end; start += r->group_len) bitmap_set(bitmap, start, min(r->end - start + 1, r->off)); } static int bitmap_check_region(const struct region *r) { if (r->start > r->end || r->group_len == 0 || r->off > r->group_len) return -EINVAL; if (r->end >= r->nbits) return -ERANGE; return 0; } static const char *bitmap_getnum(const char *str, unsigned int *num, unsigned int lastbit) { unsigned long long n; unsigned int len; if (str[0] == 'N') { *num = lastbit; return str + 1; } len = _parse_integer(str, 10, &n); if (!len) return ERR_PTR(-EINVAL); if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n) return ERR_PTR(-EOVERFLOW); *num = n; return str + len; } static inline bool end_of_str(char c) { return c == '\0' || c == '\n'; } static inline bool __end_of_region(char c) { return isspace(c) || c == ','; } static inline bool end_of_region(char c) { return __end_of_region(c) || end_of_str(c); } /* * The format allows commas and whitespaces at the beginning * of the region. */ static const char *bitmap_find_region(const char *str) { while (__end_of_region(*str)) str++; return end_of_str(*str) ? NULL : str; } static const char *bitmap_find_region_reverse(const char *start, const char *end) { while (start <= end && __end_of_region(*end)) end--; return end; } static const char *bitmap_parse_region(const char *str, struct region *r) { unsigned int lastbit = r->nbits - 1; if (!strncasecmp(str, "all", 3)) { r->start = 0; r->end = lastbit; str += 3; goto check_pattern; } str = bitmap_getnum(str, &r->start, lastbit); if (IS_ERR(str)) return str; if (end_of_region(*str)) goto no_end; if (*str != '-') return ERR_PTR(-EINVAL); str = bitmap_getnum(str + 1, &r->end, lastbit); if (IS_ERR(str)) return str; check_pattern: if (end_of_region(*str)) goto no_pattern; if (*str != ':') return ERR_PTR(-EINVAL); str = bitmap_getnum(str + 1, &r->off, lastbit); if (IS_ERR(str)) return str; if (*str != '/') return ERR_PTR(-EINVAL); return bitmap_getnum(str + 1, &r->group_len, lastbit); no_end: r->end = r->start; no_pattern: r->off = r->end + 1; r->group_len = r->end + 1; return end_of_str(*str) ? NULL : str; } /** * bitmap_parselist - convert list format ASCII string to bitmap * @buf: read user string from this buffer; must be terminated * with a \0 or \n. * @maskp: write resulting mask here * @nmaskbits: number of bits in mask to be written * * Input format is a comma-separated list of decimal numbers and * ranges. Consecutively set bits are shown as two hyphen-separated * decimal numbers, the smallest and largest bit numbers set in * the range. * Optionally each range can be postfixed to denote that only parts of it * should be set. The range will divided to groups of specific size. * From each group will be used only defined amount of bits. * Syntax: range:used_size/group_size * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 * The value 'N' can be used as a dynamically substituted token for the * maximum allowed value; i.e (nmaskbits - 1). Keep in mind that it is * dynamic, so if system changes cause the bitmap width to change, such * as more cores in a CPU list, then any ranges using N will also change. * * Returns: 0 on success, -errno on invalid input strings. Error values: * * - ``-EINVAL``: wrong region format * - ``-EINVAL``: invalid character in string * - ``-ERANGE``: bit number specified too large for mask * - ``-EOVERFLOW``: integer overflow in the input parameters */ int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits) { struct region r; long ret; r.nbits = nmaskbits; bitmap_zero(maskp, r.nbits); while (buf) { buf = bitmap_find_region(buf); if (buf == NULL) return 0; buf = bitmap_parse_region(buf, &r); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_check_region(&r); if (ret) return ret; bitmap_set_region(&r, maskp); } return 0; } EXPORT_SYMBOL(bitmap_parselist); /** * bitmap_parselist_user() - convert user buffer's list format ASCII * string to bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. * * Wrapper for bitmap_parselist(), providing it with user buffer. */ int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { char *buf; int ret; buf = memdup_user_nul(ubuf, ulen); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_parselist(buf, maskp, nmaskbits); kfree(buf); return ret; } EXPORT_SYMBOL(bitmap_parselist_user); static const char *bitmap_get_x32_reverse(const char *start, const char *end, u32 *num) { u32 ret = 0; int c, i; for (i = 0; i < 32; i += 4) { c = hex_to_bin(*end--); if (c < 0) return ERR_PTR(-EINVAL); ret |= c << i; if (start > end || __end_of_region(*end)) goto out; } if (hex_to_bin(*end--) >= 0) return ERR_PTR(-EOVERFLOW); out: *num = ret; return end; } /** * bitmap_parse - convert an ASCII hex string into a bitmap. * @start: pointer to buffer containing string. * @buflen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0 or \n. In that case, * UINT_MAX may be provided instead of string length. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. * * Commas group hex digits into chunks. Each chunk defines exactly 32 * bits of the resultant bitmask. No chunk may specify a value larger * than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value * then leading 0-bits are prepended. %-EINVAL is returned for illegal * characters. Grouping such as "1,,5", ",44", "," or "" is allowed. * Leading, embedded and trailing whitespace accepted. */ int bitmap_parse(const char *start, unsigned int buflen, unsigned long *maskp, int nmaskbits) { const char *end = strnchrnul(start, buflen, '\n') - 1; int chunks = BITS_TO_U32(nmaskbits); u32 *bitmap = (u32 *)maskp; int unset_bit; int chunk; for (chunk = 0; ; chunk++) { end = bitmap_find_region_reverse(start, end); if (start > end) break; if (!chunks--) return -EOVERFLOW; #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) end = bitmap_get_x32_reverse(start, end, &bitmap[chunk ^ 1]); #else end = bitmap_get_x32_reverse(start, end, &bitmap[chunk]); #endif if (IS_ERR(end)) return PTR_ERR(end); } unset_bit = (BITS_TO_U32(nmaskbits) - chunks) * 32; if (unset_bit < nmaskbits) { bitmap_clear(maskp, unset_bit, nmaskbits - unset_bit); return 0; } if (find_next_bit(maskp, unset_bit, nmaskbits) != unset_bit) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(bitmap_parse); |
131 131 125 175 51 125 104 73 176 140 176 176 139 24 180 24 1 180 181 181 17 11 14 9 10 157 47 2 10 5 5 4 4 182 182 104 78 157 157 1 1 4 134 7 11 146 16 146 146 12 100 40 141 6 14 79 1 52 11 130 130 130 50 127 127 125 124 75 53 126 6 3 3 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 | // SPDX-License-Identifier: GPL-2.0 /* * USB Serial Converter driver * * Copyright (C) 2009 - 2013 Johan Hovold (jhovold@gmail.com) * Copyright (C) 1999 - 2012 Greg Kroah-Hartman (greg@kroah.com) * Copyright (C) 2000 Peter Berger (pberger@brimson.com) * Copyright (C) 2000 Al Borchers (borchers@steinerpoint.com) * * This driver was originally based on the ACM driver by Armin Fuerst (which was * based on a driver by Brad Keryan) * * See Documentation/usb/usb-serial.rst for more information on using this * driver */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/serial.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/kfifo.h> #include <linux/idr.h> #define DRIVER_AUTHOR "Greg Kroah-Hartman <gregkh@linuxfoundation.org>" #define DRIVER_DESC "USB Serial Driver core" #define USB_SERIAL_TTY_MAJOR 188 #define USB_SERIAL_TTY_MINORS 512 /* should be enough for a while */ /* There is no MODULE_DEVICE_TABLE for usbserial.c. Instead the MODULE_DEVICE_TABLE declarations in each serial driver cause the "hotplug" program to pull in whatever module is necessary via modprobe, and modprobe will load usbserial because the serial drivers depend on it. */ static DEFINE_IDR(serial_minors); static DEFINE_MUTEX(table_lock); static LIST_HEAD(usb_serial_driver_list); /* * Look up the serial port structure. If it is found and it hasn't been * disconnected, return with the parent usb_serial structure's disc_mutex held * and its refcount incremented. Otherwise return NULL. */ struct usb_serial_port *usb_serial_port_get_by_minor(unsigned minor) { struct usb_serial *serial; struct usb_serial_port *port; mutex_lock(&table_lock); port = idr_find(&serial_minors, minor); if (!port) goto exit; serial = port->serial; mutex_lock(&serial->disc_mutex); if (serial->disconnected) { mutex_unlock(&serial->disc_mutex); port = NULL; } else { kref_get(&serial->kref); } exit: mutex_unlock(&table_lock); return port; } static int allocate_minors(struct usb_serial *serial, int num_ports) { struct usb_serial_port *port; unsigned int i, j; int minor; dev_dbg(&serial->interface->dev, "%s %d\n", __func__, num_ports); mutex_lock(&table_lock); for (i = 0; i < num_ports; ++i) { port = serial->port[i]; minor = idr_alloc(&serial_minors, port, 0, USB_SERIAL_TTY_MINORS, GFP_KERNEL); if (minor < 0) goto error; port->minor = minor; port->port_number = i; } serial->minors_reserved = 1; mutex_unlock(&table_lock); return 0; error: /* unwind the already allocated minors */ for (j = 0; j < i; ++j) idr_remove(&serial_minors, serial->port[j]->minor); mutex_unlock(&table_lock); return minor; } static void release_minors(struct usb_serial *serial) { int i; mutex_lock(&table_lock); for (i = 0; i < serial->num_ports; ++i) idr_remove(&serial_minors, serial->port[i]->minor); mutex_unlock(&table_lock); serial->minors_reserved = 0; } int usb_serial_claim_interface(struct usb_serial *serial, struct usb_interface *intf) { struct usb_driver *driver = serial->type->usb_driver; int ret; if (serial->sibling) return -EBUSY; ret = usb_driver_claim_interface(driver, intf, serial); if (ret) { dev_err(&serial->interface->dev, "failed to claim sibling interface: %d\n", ret); return ret; } serial->sibling = intf; return 0; } EXPORT_SYMBOL_GPL(usb_serial_claim_interface); static void release_sibling(struct usb_serial *serial, struct usb_interface *intf) { struct usb_driver *driver = serial->type->usb_driver; struct usb_interface *sibling; if (!serial->sibling) return; if (intf == serial->sibling) sibling = serial->interface; else sibling = serial->sibling; usb_set_intfdata(sibling, NULL); usb_driver_release_interface(driver, sibling); } static void destroy_serial(struct kref *kref) { struct usb_serial *serial; struct usb_serial_port *port; int i; serial = to_usb_serial(kref); /* return the minor range that this device had */ if (serial->minors_reserved) release_minors(serial); if (serial->attached && serial->type->release) serial->type->release(serial); /* Now that nothing is using the ports, they can be freed */ for (i = 0; i < serial->num_port_pointers; ++i) { port = serial->port[i]; if (port) { port->serial = NULL; put_device(&port->dev); } } usb_put_intf(serial->interface); usb_put_dev(serial->dev); kfree(serial); } void usb_serial_put(struct usb_serial *serial) { kref_put(&serial->kref, destroy_serial); } /***************************************************************************** * Driver tty interface functions *****************************************************************************/ /** * serial_install - install tty * @driver: the driver (USB in our case) * @tty: the tty being created * * Initialise the termios structure for this tty. We use the default * USB serial settings but permit them to be overridden by * serial->type->init_termios on first open. * * This is the first place a new tty gets used. Hence this is where we * acquire references to the usb_serial structure and the driver module, * where we store a pointer to the port. All these actions are reversed * in serial_cleanup(). */ static int serial_install(struct tty_driver *driver, struct tty_struct *tty) { int idx = tty->index; struct usb_serial *serial; struct usb_serial_port *port; bool init_termios; int retval = -ENODEV; port = usb_serial_port_get_by_minor(idx); if (!port) return retval; serial = port->serial; if (!try_module_get(serial->type->driver.owner)) goto err_put_serial; init_termios = (driver->termios[idx] == NULL); retval = tty_standard_install(driver, tty); if (retval) goto err_put_module; mutex_unlock(&serial->disc_mutex); /* allow the driver to update the initial settings */ if (init_termios && serial->type->init_termios) serial->type->init_termios(tty); tty->driver_data = port; return retval; err_put_module: module_put(serial->type->driver.owner); err_put_serial: usb_serial_put(serial); mutex_unlock(&serial->disc_mutex); return retval; } static int serial_port_activate(struct tty_port *tport, struct tty_struct *tty) { struct usb_serial_port *port = container_of(tport, struct usb_serial_port, port); struct usb_serial *serial = port->serial; int retval; mutex_lock(&serial->disc_mutex); if (serial->disconnected) { retval = -ENODEV; goto out_unlock; } retval = usb_autopm_get_interface(serial->interface); if (retval) goto out_unlock; retval = port->serial->type->open(tty, port); if (retval) usb_autopm_put_interface(serial->interface); out_unlock: mutex_unlock(&serial->disc_mutex); if (retval < 0) retval = usb_translate_errors(retval); return retval; } static int serial_open(struct tty_struct *tty, struct file *filp) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); return tty_port_open(&port->port, tty, filp); } /** * serial_port_shutdown - shut down hardware * @tport: tty port to shut down * * Shut down a USB serial port. Serialized against activate by the * tport mutex and kept to matching open/close pairs * of calls by the tty-port initialized flag. * * Not called if tty is console. */ static void serial_port_shutdown(struct tty_port *tport) { struct usb_serial_port *port = container_of(tport, struct usb_serial_port, port); struct usb_serial_driver *drv = port->serial->type; if (drv->close) drv->close(port); usb_autopm_put_interface(port->serial->interface); } static void serial_hangup(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); tty_port_hangup(&port->port); } static void serial_close(struct tty_struct *tty, struct file *filp) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); tty_port_close(&port->port, tty, filp); } /** * serial_cleanup - free resources post close/hangup * @tty: tty to clean up * * Do the resource freeing and refcount dropping for the port. * Avoid freeing the console. * * Called asynchronously after the last tty kref is dropped. */ static void serial_cleanup(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial; struct module *owner; dev_dbg(&port->dev, "%s\n", __func__); /* The console is magical. Do not hang up the console hardware * or there will be tears. */ if (port->port.console) return; tty->driver_data = NULL; serial = port->serial; owner = serial->type->driver.owner; usb_serial_put(serial); module_put(owner); } static ssize_t serial_write(struct tty_struct *tty, const u8 *buf, size_t count) { struct usb_serial_port *port = tty->driver_data; int retval = -ENODEV; if (port->serial->dev->state == USB_STATE_NOTATTACHED) goto exit; dev_dbg(&port->dev, "%s - %zu byte(s)\n", __func__, count); retval = port->serial->type->write(tty, port, buf, count); if (retval < 0) retval = usb_translate_errors(retval); exit: return retval; } static unsigned int serial_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); return port->serial->type->write_room(tty); } static unsigned int serial_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s\n", __func__); if (serial->disconnected) return 0; return serial->type->chars_in_buffer(tty); } static void serial_wait_until_sent(struct tty_struct *tty, int timeout) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s\n", __func__); if (!port->serial->type->wait_until_sent) return; mutex_lock(&serial->disc_mutex); if (!serial->disconnected) port->serial->type->wait_until_sent(tty, timeout); mutex_unlock(&serial->disc_mutex); } static void serial_throttle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->throttle) port->serial->type->throttle(tty); } static void serial_unthrottle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->unthrottle) port->serial->type->unthrottle(tty); } static int serial_get_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct tty_port *tport = &port->port; unsigned int close_delay, closing_wait; mutex_lock(&tport->mutex); close_delay = jiffies_to_msecs(tport->close_delay) / 10; closing_wait = tport->closing_wait; if (closing_wait != ASYNC_CLOSING_WAIT_NONE) closing_wait = jiffies_to_msecs(closing_wait) / 10; ss->line = port->minor; ss->close_delay = close_delay; ss->closing_wait = closing_wait; if (port->serial->type->get_serial) port->serial->type->get_serial(tty, ss); mutex_unlock(&tport->mutex); return 0; } static int serial_set_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct tty_port *tport = &port->port; unsigned int close_delay, closing_wait; int ret = 0; close_delay = msecs_to_jiffies(ss->close_delay * 10); closing_wait = ss->closing_wait; if (closing_wait != ASYNC_CLOSING_WAIT_NONE) closing_wait = msecs_to_jiffies(closing_wait * 10); mutex_lock(&tport->mutex); if (!capable(CAP_SYS_ADMIN)) { if (close_delay != tport->close_delay || closing_wait != tport->closing_wait) { ret = -EPERM; goto out_unlock; } } if (port->serial->type->set_serial) { ret = port->serial->type->set_serial(tty, ss); if (ret) goto out_unlock; } tport->close_delay = close_delay; tport->closing_wait = closing_wait; out_unlock: mutex_unlock(&tport->mutex); return ret; } static int serial_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct usb_serial_port *port = tty->driver_data; int retval = -ENOIOCTLCMD; dev_dbg(&port->dev, "%s - cmd 0x%04x\n", __func__, cmd); switch (cmd) { case TIOCMIWAIT: if (port->serial->type->tiocmiwait) retval = port->serial->type->tiocmiwait(tty, arg); break; default: if (port->serial->type->ioctl) retval = port->serial->type->ioctl(tty, cmd, arg); } return retval; } static void serial_set_termios(struct tty_struct *tty, const struct ktermios *old) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->set_termios) port->serial->type->set_termios(tty, port, old); else tty_termios_copy_hw(&tty->termios, old); } static int serial_break(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->break_ctl) return port->serial->type->break_ctl(tty, break_state); return -ENOTTY; } static int serial_proc_show(struct seq_file *m, void *v) { struct usb_serial *serial; struct usb_serial_port *port; int i; char tmp[40]; seq_puts(m, "usbserinfo:1.0 driver:2.0\n"); for (i = 0; i < USB_SERIAL_TTY_MINORS; ++i) { port = usb_serial_port_get_by_minor(i); if (port == NULL) continue; serial = port->serial; seq_printf(m, "%d:", i); if (serial->type->driver.owner) seq_printf(m, " module:%s", module_name(serial->type->driver.owner)); seq_printf(m, " name:\"%s\"", serial->type->description); seq_printf(m, " vendor:%04x product:%04x", le16_to_cpu(serial->dev->descriptor.idVendor), le16_to_cpu(serial->dev->descriptor.idProduct)); seq_printf(m, " num_ports:%d", serial->num_ports); seq_printf(m, " port:%d", port->port_number); usb_make_path(serial->dev, tmp, sizeof(tmp)); seq_printf(m, " path:%s", tmp); seq_putc(m, '\n'); usb_serial_put(serial); mutex_unlock(&serial->disc_mutex); } return 0; } static int serial_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->tiocmget) return port->serial->type->tiocmget(tty); return -ENOTTY; } static int serial_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->tiocmset) return port->serial->type->tiocmset(tty, set, clear); return -ENOTTY; } static int serial_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->get_icount) return port->serial->type->get_icount(tty, icount); return -ENOTTY; } /* * We would be calling tty_wakeup here, but unfortunately some line * disciplines have an annoying habit of calling tty->write from * the write wakeup callback (e.g. n_hdlc.c). */ void usb_serial_port_softint(struct usb_serial_port *port) { schedule_work(&port->work); } EXPORT_SYMBOL_GPL(usb_serial_port_softint); static void usb_serial_port_work(struct work_struct *work) { struct usb_serial_port *port = container_of(work, struct usb_serial_port, work); tty_port_tty_wakeup(&port->port); } static void usb_serial_port_poison_urbs(struct usb_serial_port *port) { int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_poison_urb(port->read_urbs[i]); for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_poison_urb(port->write_urbs[i]); usb_poison_urb(port->interrupt_in_urb); usb_poison_urb(port->interrupt_out_urb); } static void usb_serial_port_unpoison_urbs(struct usb_serial_port *port) { int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_unpoison_urb(port->read_urbs[i]); for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_unpoison_urb(port->write_urbs[i]); usb_unpoison_urb(port->interrupt_in_urb); usb_unpoison_urb(port->interrupt_out_urb); } static void usb_serial_port_release(struct device *dev) { struct usb_serial_port *port = to_usb_serial_port(dev); int i; dev_dbg(dev, "%s\n", __func__); usb_free_urb(port->interrupt_in_urb); usb_free_urb(port->interrupt_out_urb); for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { usb_free_urb(port->read_urbs[i]); kfree(port->bulk_in_buffers[i]); } for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { usb_free_urb(port->write_urbs[i]); kfree(port->bulk_out_buffers[i]); } kfifo_free(&port->write_fifo); kfree(port->interrupt_in_buffer); kfree(port->interrupt_out_buffer); tty_port_destroy(&port->port); kfree(port); } static struct usb_serial *create_serial(struct usb_device *dev, struct usb_interface *interface, struct usb_serial_driver *driver) { struct usb_serial *serial; serial = kzalloc(sizeof(*serial), GFP_KERNEL); if (!serial) return NULL; serial->dev = usb_get_dev(dev); serial->type = driver; serial->interface = usb_get_intf(interface); kref_init(&serial->kref); mutex_init(&serial->disc_mutex); serial->minors_reserved = 0; return serial; } static const struct usb_device_id *match_dynamic_id(struct usb_interface *intf, struct usb_serial_driver *drv) { struct usb_dynid *dynid; spin_lock(&drv->dynids.lock); list_for_each_entry(dynid, &drv->dynids.list, node) { if (usb_match_one_id(intf, &dynid->id)) { spin_unlock(&drv->dynids.lock); return &dynid->id; } } spin_unlock(&drv->dynids.lock); return NULL; } static const struct usb_device_id *get_iface_id(struct usb_serial_driver *drv, struct usb_interface *intf) { const struct usb_device_id *id; id = usb_match_id(intf, drv->id_table); if (id) { dev_dbg(&intf->dev, "static descriptor matches\n"); goto exit; } id = match_dynamic_id(intf, drv); if (id) dev_dbg(&intf->dev, "dynamic descriptor matches\n"); exit: return id; } /* Caller must hold table_lock */ static struct usb_serial_driver *search_serial_device( struct usb_interface *iface) { const struct usb_device_id *id = NULL; struct usb_serial_driver *drv; struct usb_driver *driver = to_usb_driver(iface->dev.driver); /* Check if the usb id matches a known device */ list_for_each_entry(drv, &usb_serial_driver_list, driver_list) { if (drv->usb_driver == driver) id = get_iface_id(drv, iface); if (id) return drv; } return NULL; } static bool serial_port_carrier_raised(struct tty_port *port) { struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); struct usb_serial_driver *drv = p->serial->type; if (drv->carrier_raised) return drv->carrier_raised(p); /* No carrier control - don't block */ return true; } static void serial_port_dtr_rts(struct tty_port *port, bool on) { struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); struct usb_serial_driver *drv = p->serial->type; if (drv->dtr_rts) drv->dtr_rts(p, on); } static ssize_t port_number_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_serial_port *port = to_usb_serial_port(dev); return sprintf(buf, "%u\n", port->port_number); } static DEVICE_ATTR_RO(port_number); static struct attribute *usb_serial_port_attrs[] = { &dev_attr_port_number.attr, NULL }; ATTRIBUTE_GROUPS(usb_serial_port); static const struct tty_port_operations serial_port_ops = { .carrier_raised = serial_port_carrier_raised, .dtr_rts = serial_port_dtr_rts, .activate = serial_port_activate, .shutdown = serial_port_shutdown, }; static void store_endpoint(struct usb_serial *serial, struct usb_serial_endpoints *epds, struct usb_endpoint_descriptor *epd) { struct device *dev = &serial->interface->dev; u8 addr = epd->bEndpointAddress; if (usb_endpoint_is_bulk_in(epd)) { if (epds->num_bulk_in == ARRAY_SIZE(epds->bulk_in)) return; dev_dbg(dev, "found bulk in endpoint %02x\n", addr); epds->bulk_in[epds->num_bulk_in++] = epd; } else if (usb_endpoint_is_bulk_out(epd)) { if (epds->num_bulk_out == ARRAY_SIZE(epds->bulk_out)) return; dev_dbg(dev, "found bulk out endpoint %02x\n", addr); epds->bulk_out[epds->num_bulk_out++] = epd; } else if (usb_endpoint_is_int_in(epd)) { if (epds->num_interrupt_in == ARRAY_SIZE(epds->interrupt_in)) return; dev_dbg(dev, "found interrupt in endpoint %02x\n", addr); epds->interrupt_in[epds->num_interrupt_in++] = epd; } else if (usb_endpoint_is_int_out(epd)) { if (epds->num_interrupt_out == ARRAY_SIZE(epds->interrupt_out)) return; dev_dbg(dev, "found interrupt out endpoint %02x\n", addr); epds->interrupt_out[epds->num_interrupt_out++] = epd; } } static void find_endpoints(struct usb_serial *serial, struct usb_serial_endpoints *epds, struct usb_interface *intf) { struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *epd; unsigned int i; iface_desc = intf->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { epd = &iface_desc->endpoint[i].desc; store_endpoint(serial, epds, epd); } } static int setup_port_bulk_in(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; int i; buffer_size = max_t(int, type->bulk_in_size, usb_endpoint_maxp(epd)); port->bulk_in_size = buffer_size; port->bulk_in_endpointAddress = epd->bEndpointAddress; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { set_bit(i, &port->read_urbs_free); port->read_urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!port->read_urbs[i]) return -ENOMEM; port->bulk_in_buffers[i] = kmalloc(buffer_size, GFP_KERNEL); if (!port->bulk_in_buffers[i]) return -ENOMEM; usb_fill_bulk_urb(port->read_urbs[i], udev, usb_rcvbulkpipe(udev, epd->bEndpointAddress), port->bulk_in_buffers[i], buffer_size, type->read_bulk_callback, port); } port->read_urb = port->read_urbs[0]; port->bulk_in_buffer = port->bulk_in_buffers[0]; return 0; } static int setup_port_bulk_out(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; int i; if (kfifo_alloc(&port->write_fifo, PAGE_SIZE, GFP_KERNEL)) return -ENOMEM; if (type->bulk_out_size) buffer_size = type->bulk_out_size; else buffer_size = usb_endpoint_maxp(epd); port->bulk_out_size = buffer_size; port->bulk_out_endpointAddress = epd->bEndpointAddress; for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { set_bit(i, &port->write_urbs_free); port->write_urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!port->write_urbs[i]) return -ENOMEM; port->bulk_out_buffers[i] = kmalloc(buffer_size, GFP_KERNEL); if (!port->bulk_out_buffers[i]) return -ENOMEM; usb_fill_bulk_urb(port->write_urbs[i], udev, usb_sndbulkpipe(udev, epd->bEndpointAddress), port->bulk_out_buffers[i], buffer_size, type->write_bulk_callback, port); } port->write_urb = port->write_urbs[0]; port->bulk_out_buffer = port->bulk_out_buffers[0]; return 0; } static int setup_port_interrupt_in(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; port->interrupt_in_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_in_urb) return -ENOMEM; buffer_size = usb_endpoint_maxp(epd); port->interrupt_in_endpointAddress = epd->bEndpointAddress; port->interrupt_in_buffer = kmalloc(buffer_size, GFP_KERNEL); if (!port->interrupt_in_buffer) return -ENOMEM; usb_fill_int_urb(port->interrupt_in_urb, udev, usb_rcvintpipe(udev, epd->bEndpointAddress), port->interrupt_in_buffer, buffer_size, type->read_int_callback, port, epd->bInterval); return 0; } static int setup_port_interrupt_out(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; port->interrupt_out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_out_urb) return -ENOMEM; buffer_size = usb_endpoint_maxp(epd); port->interrupt_out_size = buffer_size; port->interrupt_out_endpointAddress = epd->bEndpointAddress; port->interrupt_out_buffer = kmalloc(buffer_size, GFP_KERNEL); if (!port->interrupt_out_buffer) return -ENOMEM; usb_fill_int_urb(port->interrupt_out_urb, udev, usb_sndintpipe(udev, epd->bEndpointAddress), port->interrupt_out_buffer, buffer_size, type->write_int_callback, port, epd->bInterval); return 0; } static int usb_serial_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct device *ddev = &interface->dev; struct usb_device *dev = interface_to_usbdev(interface); struct usb_serial *serial = NULL; struct usb_serial_port *port; struct usb_serial_endpoints *epds; struct usb_serial_driver *type = NULL; int retval; int i; int num_ports = 0; unsigned char max_endpoints; mutex_lock(&table_lock); type = search_serial_device(interface); if (!type) { mutex_unlock(&table_lock); dev_dbg(ddev, "none matched\n"); return -ENODEV; } if (!try_module_get(type->driver.owner)) { mutex_unlock(&table_lock); dev_err(ddev, "module get failed, exiting\n"); return -EIO; } mutex_unlock(&table_lock); serial = create_serial(dev, interface, type); if (!serial) { retval = -ENOMEM; goto err_put_module; } /* if this device type has a probe function, call it */ if (type->probe) { const struct usb_device_id *id; id = get_iface_id(type, interface); retval = type->probe(serial, id); if (retval) { dev_dbg(ddev, "sub driver rejected device\n"); goto err_release_sibling; } } /* descriptor matches, let's find the endpoints needed */ epds = kzalloc(sizeof(*epds), GFP_KERNEL); if (!epds) { retval = -ENOMEM; goto err_release_sibling; } find_endpoints(serial, epds, interface); if (serial->sibling) find_endpoints(serial, epds, serial->sibling); if (epds->num_bulk_in < type->num_bulk_in || epds->num_bulk_out < type->num_bulk_out || epds->num_interrupt_in < type->num_interrupt_in || epds->num_interrupt_out < type->num_interrupt_out) { dev_err(ddev, "required endpoints missing\n"); retval = -ENODEV; goto err_free_epds; } if (type->calc_num_ports) { retval = type->calc_num_ports(serial, epds); if (retval < 0) goto err_free_epds; num_ports = retval; } if (!num_ports) num_ports = type->num_ports; if (num_ports > MAX_NUM_PORTS) { dev_warn(ddev, "too many ports requested: %d\n", num_ports); num_ports = MAX_NUM_PORTS; } serial->num_ports = (unsigned char)num_ports; serial->num_bulk_in = epds->num_bulk_in; serial->num_bulk_out = epds->num_bulk_out; serial->num_interrupt_in = epds->num_interrupt_in; serial->num_interrupt_out = epds->num_interrupt_out; /* found all that we need */ dev_info(ddev, "%s converter detected\n", type->description); /* create our ports, we need as many as the max endpoints */ /* we don't use num_ports here because some devices have more endpoint pairs than ports */ max_endpoints = max(epds->num_bulk_in, epds->num_bulk_out); max_endpoints = max(max_endpoints, epds->num_interrupt_in); max_endpoints = max(max_endpoints, epds->num_interrupt_out); max_endpoints = max(max_endpoints, serial->num_ports); serial->num_port_pointers = max_endpoints; dev_dbg(ddev, "setting up %d port structure(s)\n", max_endpoints); for (i = 0; i < max_endpoints; ++i) { port = kzalloc(sizeof(struct usb_serial_port), GFP_KERNEL); if (!port) { retval = -ENOMEM; goto err_free_epds; } tty_port_init(&port->port); port->port.ops = &serial_port_ops; port->serial = serial; spin_lock_init(&port->lock); /* Keep this for private driver use for the moment but should probably go away */ INIT_WORK(&port->work, usb_serial_port_work); serial->port[i] = port; port->dev.parent = &interface->dev; port->dev.driver = NULL; port->dev.bus = &usb_serial_bus_type; port->dev.release = &usb_serial_port_release; port->dev.groups = usb_serial_port_groups; device_initialize(&port->dev); } /* set up the endpoint information */ for (i = 0; i < epds->num_bulk_in; ++i) { retval = setup_port_bulk_in(serial->port[i], epds->bulk_in[i]); if (retval) goto err_free_epds; } for (i = 0; i < epds->num_bulk_out; ++i) { retval = setup_port_bulk_out(serial->port[i], epds->bulk_out[i]); if (retval) goto err_free_epds; } if (serial->type->read_int_callback) { for (i = 0; i < epds->num_interrupt_in; ++i) { retval = setup_port_interrupt_in(serial->port[i], epds->interrupt_in[i]); if (retval) goto err_free_epds; } } else if (epds->num_interrupt_in) { dev_dbg(ddev, "The device claims to support interrupt in transfers, but read_int_callback is not defined\n"); } if (serial->type->write_int_callback) { for (i = 0; i < epds->num_interrupt_out; ++i) { retval = setup_port_interrupt_out(serial->port[i], epds->interrupt_out[i]); if (retval) goto err_free_epds; } } else if (epds->num_interrupt_out) { dev_dbg(ddev, "The device claims to support interrupt out transfers, but write_int_callback is not defined\n"); } usb_set_intfdata(interface, serial); /* if this device type has an attach function, call it */ if (type->attach) { retval = type->attach(serial); if (retval < 0) goto err_free_epds; serial->attached = 1; if (retval > 0) { /* quietly accept this device, but don't bind to a serial port as it's about to disappear */ serial->num_ports = 0; goto exit; } } else { serial->attached = 1; } retval = allocate_minors(serial, num_ports); if (retval) { dev_err(ddev, "No more free serial minor numbers\n"); goto err_free_epds; } /* register all of the individual ports with the driver core */ for (i = 0; i < num_ports; ++i) { port = serial->port[i]; dev_set_name(&port->dev, "ttyUSB%d", port->minor); dev_dbg(ddev, "registering %s\n", dev_name(&port->dev)); device_enable_async_suspend(&port->dev); retval = device_add(&port->dev); if (retval) dev_err(ddev, "Error registering port device, continuing\n"); } if (num_ports > 0) usb_serial_console_init(serial->port[0]->minor); exit: kfree(epds); module_put(type->driver.owner); return 0; err_free_epds: kfree(epds); err_release_sibling: release_sibling(serial, interface); usb_serial_put(serial); err_put_module: module_put(type->driver.owner); return retval; } static void usb_serial_disconnect(struct usb_interface *interface) { int i; struct usb_serial *serial = usb_get_intfdata(interface); struct device *dev = &interface->dev; struct usb_serial_port *port; struct tty_struct *tty; /* sibling interface is cleaning up */ if (!serial) return; usb_serial_console_disconnect(serial); mutex_lock(&serial->disc_mutex); /* must set a flag, to signal subdrivers */ serial->disconnected = 1; mutex_unlock(&serial->disc_mutex); for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; tty = tty_port_tty_get(&port->port); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } usb_serial_port_poison_urbs(port); wake_up_interruptible(&port->port.delta_msr_wait); cancel_work_sync(&port->work); if (device_is_registered(&port->dev)) device_del(&port->dev); } if (serial->type->disconnect) serial->type->disconnect(serial); release_sibling(serial, interface); /* let the last holder of this object cause it to be cleaned up */ usb_serial_put(serial); dev_info(dev, "device disconnected\n"); } int usb_serial_suspend(struct usb_interface *intf, pm_message_t message) { struct usb_serial *serial = usb_get_intfdata(intf); int i, r; /* suspend when called for first sibling interface */ if (serial->suspend_count++) return 0; /* * serial->type->suspend() MUST return 0 in system sleep context, * otherwise, the resume callback has to recover device from * previous suspend failure. */ if (serial->type->suspend) { r = serial->type->suspend(serial, message); if (r < 0) { serial->suspend_count--; return r; } } for (i = 0; i < serial->num_ports; ++i) usb_serial_port_poison_urbs(serial->port[i]); return 0; } EXPORT_SYMBOL(usb_serial_suspend); static void usb_serial_unpoison_port_urbs(struct usb_serial *serial) { int i; for (i = 0; i < serial->num_ports; ++i) usb_serial_port_unpoison_urbs(serial->port[i]); } int usb_serial_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); int rv; /* resume when called for last sibling interface */ if (--serial->suspend_count) return 0; usb_serial_unpoison_port_urbs(serial); if (serial->type->resume) rv = serial->type->resume(serial); else rv = usb_serial_generic_resume(serial); return rv; } EXPORT_SYMBOL(usb_serial_resume); static int usb_serial_reset_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); int rv; /* resume when called for last sibling interface */ if (--serial->suspend_count) return 0; usb_serial_unpoison_port_urbs(serial); if (serial->type->reset_resume) { rv = serial->type->reset_resume(serial); } else { rv = -EOPNOTSUPP; intf->needs_binding = 1; } return rv; } static const struct tty_operations serial_ops = { .open = serial_open, .close = serial_close, .write = serial_write, .hangup = serial_hangup, .write_room = serial_write_room, .ioctl = serial_ioctl, .set_termios = serial_set_termios, .throttle = serial_throttle, .unthrottle = serial_unthrottle, .break_ctl = serial_break, .chars_in_buffer = serial_chars_in_buffer, .wait_until_sent = serial_wait_until_sent, .tiocmget = serial_tiocmget, .tiocmset = serial_tiocmset, .get_icount = serial_get_icount, .set_serial = serial_set_serial, .get_serial = serial_get_serial, .cleanup = serial_cleanup, .install = serial_install, .proc_show = serial_proc_show, }; struct tty_driver *usb_serial_tty_driver; static int __init usb_serial_init(void) { int result; usb_serial_tty_driver = tty_alloc_driver(USB_SERIAL_TTY_MINORS, TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV); if (IS_ERR(usb_serial_tty_driver)) return PTR_ERR(usb_serial_tty_driver); /* Initialize our global data */ result = bus_register(&usb_serial_bus_type); if (result) { pr_err("%s - registering bus driver failed\n", __func__); goto err_put_driver; } usb_serial_tty_driver->driver_name = "usbserial"; usb_serial_tty_driver->name = "ttyUSB"; usb_serial_tty_driver->major = USB_SERIAL_TTY_MAJOR; usb_serial_tty_driver->minor_start = 0; usb_serial_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; usb_serial_tty_driver->subtype = SERIAL_TYPE_NORMAL; usb_serial_tty_driver->init_termios = tty_std_termios; usb_serial_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; usb_serial_tty_driver->init_termios.c_ispeed = 9600; usb_serial_tty_driver->init_termios.c_ospeed = 9600; tty_set_operations(usb_serial_tty_driver, &serial_ops); result = tty_register_driver(usb_serial_tty_driver); if (result) { pr_err("%s - tty_register_driver failed\n", __func__); goto err_unregister_bus; } /* register the generic driver, if we should */ result = usb_serial_generic_register(); if (result < 0) { pr_err("%s - registering generic driver failed\n", __func__); goto err_unregister_driver; } return result; err_unregister_driver: tty_unregister_driver(usb_serial_tty_driver); err_unregister_bus: bus_unregister(&usb_serial_bus_type); err_put_driver: pr_err("%s - returning with error %d\n", __func__, result); tty_driver_kref_put(usb_serial_tty_driver); return result; } static void __exit usb_serial_exit(void) { usb_serial_console_exit(); usb_serial_generic_deregister(); tty_unregister_driver(usb_serial_tty_driver); tty_driver_kref_put(usb_serial_tty_driver); bus_unregister(&usb_serial_bus_type); idr_destroy(&serial_minors); } module_init(usb_serial_init); module_exit(usb_serial_exit); #define set_to_generic_if_null(type, function) \ do { \ if (!type->function) { \ type->function = usb_serial_generic_##function; \ pr_debug("%s: using generic " #function "\n", \ type->driver.name); \ } \ } while (0) static void usb_serial_operations_init(struct usb_serial_driver *device) { set_to_generic_if_null(device, open); set_to_generic_if_null(device, write); set_to_generic_if_null(device, close); set_to_generic_if_null(device, write_room); set_to_generic_if_null(device, chars_in_buffer); if (device->tx_empty) set_to_generic_if_null(device, wait_until_sent); set_to_generic_if_null(device, read_bulk_callback); set_to_generic_if_null(device, write_bulk_callback); set_to_generic_if_null(device, process_read_urb); set_to_generic_if_null(device, prepare_write_buffer); } static int usb_serial_register(struct usb_serial_driver *driver) { int retval; if (usb_disabled()) return -ENODEV; if (!driver->description) driver->description = driver->driver.name; if (!driver->usb_driver) { WARN(1, "Serial driver %s has no usb_driver\n", driver->description); return -EINVAL; } /* Prevent individual ports from being unbound. */ driver->driver.suppress_bind_attrs = true; usb_serial_operations_init(driver); /* Add this device to our list of devices */ mutex_lock(&table_lock); list_add(&driver->driver_list, &usb_serial_driver_list); retval = usb_serial_bus_register(driver); if (retval) { pr_err("problem %d when registering driver %s\n", retval, driver->description); list_del(&driver->driver_list); } else { pr_info("USB Serial support registered for %s\n", driver->description); } mutex_unlock(&table_lock); return retval; } static void usb_serial_deregister(struct usb_serial_driver *device) { pr_info("USB Serial deregistering driver %s\n", device->description); mutex_lock(&table_lock); list_del(&device->driver_list); mutex_unlock(&table_lock); usb_serial_bus_deregister(device); } /** * usb_serial_register_drivers - register drivers for a usb-serial module * @serial_drivers: NULL-terminated array of pointers to drivers to be registered * @name: name of the usb_driver for this set of @serial_drivers * @id_table: list of all devices this @serial_drivers set binds to * * Registers all the drivers in the @serial_drivers array, and dynamically * creates a struct usb_driver with the name @name and id_table of @id_table. */ int usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[], const char *name, const struct usb_device_id *id_table) { int rc; struct usb_driver *udriver; struct usb_serial_driver * const *sd; /* * udriver must be registered before any of the serial drivers, * because the store_new_id() routine for the serial drivers (in * bus.c) probes udriver. * * Performance hack: We don't want udriver to be probed until * the serial drivers are registered, because the probe would * simply fail for lack of a matching serial driver. * So we leave udriver's id_table set to NULL until we are all set. * * Suspend/resume support is implemented in the usb-serial core, * so fill in the PM-related fields in udriver. */ udriver = kzalloc(sizeof(*udriver), GFP_KERNEL); if (!udriver) return -ENOMEM; udriver->name = name; udriver->no_dynamic_id = 1; udriver->supports_autosuspend = 1; udriver->suspend = usb_serial_suspend; udriver->resume = usb_serial_resume; udriver->probe = usb_serial_probe; udriver->disconnect = usb_serial_disconnect; /* we only set the reset_resume field if the serial_driver has one */ for (sd = serial_drivers; *sd; ++sd) { if ((*sd)->reset_resume) { udriver->reset_resume = usb_serial_reset_resume; break; } } rc = usb_register(udriver); if (rc) goto err_free_driver; for (sd = serial_drivers; *sd; ++sd) { (*sd)->usb_driver = udriver; rc = usb_serial_register(*sd); if (rc) goto err_deregister_drivers; } /* Now set udriver's id_table and look for matches */ udriver->id_table = id_table; rc = driver_attach(&udriver->driver); return 0; err_deregister_drivers: while (sd-- > serial_drivers) usb_serial_deregister(*sd); usb_deregister(udriver); err_free_driver: kfree(udriver); return rc; } EXPORT_SYMBOL_GPL(usb_serial_register_drivers); /** * usb_serial_deregister_drivers - deregister drivers for a usb-serial module * @serial_drivers: NULL-terminated array of pointers to drivers to be deregistered * * Deregisters all the drivers in the @serial_drivers array and deregisters and * frees the struct usb_driver that was created by the call to * usb_serial_register_drivers(). */ void usb_serial_deregister_drivers(struct usb_serial_driver *const serial_drivers[]) { struct usb_driver *udriver = (*serial_drivers)->usb_driver; for (; *serial_drivers; ++serial_drivers) usb_serial_deregister(*serial_drivers); usb_deregister(udriver); kfree(udriver); } EXPORT_SYMBOL_GPL(usb_serial_deregister_drivers); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2"); |
12 1 3 2 2 4 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 | // SPDX-License-Identifier: GPL-2.0-or-later /* * mpls tunnels An implementation mpls tunnels using the light weight tunnel * infrastructure * * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> */ #include <linux/types.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/module.h> #include <linux/mpls.h> #include <linux/vmalloc.h> #include <net/ip.h> #include <net/dst.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/netns/generic.h> #include <net/ip6_fib.h> #include <net/route.h> #include <net/mpls_iptunnel.h> #include <linux/mpls_iptunnel.h> #include "internal.h" static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { [MPLS_IPTUNNEL_DST] = { .len = sizeof(u32) }, [MPLS_IPTUNNEL_TTL] = { .type = NLA_U8 }, }; static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) { /* The size of the layer 2.5 labels to be added for this route */ return en->labels * sizeof(struct mpls_shim_hdr); } static int mpls_xmit(struct sk_buff *skb) { struct mpls_iptunnel_encap *tun_encap_info; struct mpls_shim_hdr *hdr; struct net_device *out_dev; unsigned int hh_len; unsigned int new_header_size; unsigned int mtu; struct dst_entry *dst = skb_dst(skb); struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; struct mpls_dev *out_mdev; struct net *net; int err = 0; bool bos; int i; unsigned int ttl; /* Find the output device */ out_dev = dst->dev; net = dev_net(out_dev); if (!mpls_output_possible(out_dev) || !dst->lwtstate || skb_warn_if_lro(skb)) goto drop; skb_forward_csum(skb); tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate); /* Obtain the ttl using the following set of rules. * * LWT ttl propagation setting: * - disabled => use default TTL value from LWT * - enabled => use TTL value from IPv4/IPv6 header * - default => * Global ttl propagation setting: * - disabled => use default TTL value from global setting * - enabled => use TTL value from IPv4/IPv6 header */ if (dst->ops->family == AF_INET) { if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) ttl = tun_encap_info->default_ttl; else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && !net->mpls.ip_ttl_propagate) ttl = net->mpls.default_ttl; else ttl = ip_hdr(skb)->ttl; rt = dst_rtable(dst); } else if (dst->ops->family == AF_INET6) { if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) ttl = tun_encap_info->default_ttl; else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && !net->mpls.ip_ttl_propagate) ttl = net->mpls.default_ttl; else ttl = ipv6_hdr(skb)->hop_limit; rt6 = dst_rt6_info(dst); } else { goto drop; } /* Verify the destination can hold the packet */ new_header_size = mpls_encap_size(tun_encap_info); mtu = mpls_dev_mtu(out_dev); if (mpls_pkt_too_big(skb, mtu - new_header_size)) goto drop; hh_len = LL_RESERVED_SPACE(out_dev); if (!out_dev->header_ops) hh_len = 0; /* Ensure there is enough space for the headers in the skb */ if (skb_cow(skb, hh_len + new_header_size)) goto drop; skb_set_inner_protocol(skb, skb->protocol); skb_reset_inner_network_header(skb); skb_push(skb, new_header_size); skb_reset_network_header(skb); skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); /* Push the new labels */ hdr = mpls_hdr(skb); bos = true; for (i = tun_encap_info->labels - 1; i >= 0; i--) { hdr[i] = mpls_entry_encode(tun_encap_info->label[i], ttl, 0, bos); bos = false; } mpls_stats_inc_outucastpkts(out_dev, skb); if (rt) { if (rt->rt_gw_family == AF_INET6) err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6, skb); else err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4, skb); } else if (rt6) { if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) { /* 6PE (RFC 4798) */ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt6->rt6i_gateway.s6_addr32[3], skb); } else err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, skb); } if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); return LWTUNNEL_XMIT_DONE; drop: out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; if (out_mdev) MPLS_INC_STATS(out_mdev, tx_errors); kfree_skb(skb); return -EINVAL; } static int mpls_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct mpls_iptunnel_encap *tun_encap_info; struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; struct lwtunnel_state *newts; u8 n_labels; int ret; ret = nla_parse_nested_deprecated(tb, MPLS_IPTUNNEL_MAX, nla, mpls_iptunnel_policy, extack); if (ret < 0) return ret; if (!tb[MPLS_IPTUNNEL_DST]) { NL_SET_ERR_MSG(extack, "MPLS_IPTUNNEL_DST attribute is missing"); return -EINVAL; } /* determine number of labels */ if (nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, &n_labels, NULL, extack)) return -EINVAL; newts = lwtunnel_state_alloc(struct_size(tun_encap_info, label, n_labels)); if (!newts) return -ENOMEM; tun_encap_info = mpls_lwtunnel_encap(newts); ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], n_labels, &tun_encap_info->labels, tun_encap_info->label, extack); if (ret) goto errout; tun_encap_info->ttl_propagate = MPLS_TTL_PROP_DEFAULT; if (tb[MPLS_IPTUNNEL_TTL]) { tun_encap_info->default_ttl = nla_get_u8(tb[MPLS_IPTUNNEL_TTL]); /* TTL 0 implies propagate from IP header */ tun_encap_info->ttl_propagate = tun_encap_info->default_ttl ? MPLS_TTL_PROP_DISABLED : MPLS_TTL_PROP_ENABLED; } newts->type = LWTUNNEL_ENCAP_MPLS; newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; newts->headroom = mpls_encap_size(tun_encap_info); *ts = newts; return 0; errout: kfree(newts); *ts = NULL; return ret; } static int mpls_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; tun_encap_info = mpls_lwtunnel_encap(lwtstate); if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, tun_encap_info->label)) goto nla_put_failure; if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT && nla_put_u8(skb, MPLS_IPTUNNEL_TTL, tun_encap_info->default_ttl)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; int nlsize; tun_encap_info = mpls_lwtunnel_encap(lwtstate); nlsize = nla_total_size(tun_encap_info->labels * 4); if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT) nlsize += nla_total_size(1); return nlsize; } static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a); struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); int l; if (a_hdr->labels != b_hdr->labels || a_hdr->ttl_propagate != b_hdr->ttl_propagate || a_hdr->default_ttl != b_hdr->default_ttl) return 1; for (l = 0; l < a_hdr->labels; l++) if (a_hdr->label[l] != b_hdr->label[l]) return 1; return 0; } static const struct lwtunnel_encap_ops mpls_iptun_ops = { .build_state = mpls_build_state, .xmit = mpls_xmit, .fill_encap = mpls_fill_encap_info, .get_encap_size = mpls_encap_nlsize, .cmp_encap = mpls_encap_cmp, .owner = THIS_MODULE, }; static int __init mpls_iptunnel_init(void) { return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); } module_init(mpls_iptunnel_init); static void __exit mpls_iptunnel_exit(void) { lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); } module_exit(mpls_iptunnel_exit); MODULE_ALIAS_RTNL_LWT(MPLS); MODULE_SOFTDEP("post: mpls_gso"); MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); MODULE_LICENSE("GPL v2"); |
4 6 2 2 12 10 2 3 3 2 2 2 2 2 2 16 6 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 | /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H #include <linux/rcupdate_wait.h> #define mtype_do_test IPSET_TOKEN(MTYPE, _do_test) #define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test) #define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled) #define mtype_do_add IPSET_TOKEN(MTYPE, _do_add) #define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) #define mtype_do_del IPSET_TOKEN(MTYPE, _do_del) #define mtype_do_list IPSET_TOKEN(MTYPE, _do_list) #define mtype_do_head IPSET_TOKEN(MTYPE, _do_head) #define mtype_adt_elem IPSET_TOKEN(MTYPE, _adt_elem) #define mtype_add_timeout IPSET_TOKEN(MTYPE, _add_timeout) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) #define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) #define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) #define mtype_memsize IPSET_TOKEN(MTYPE, _memsize) #define mtype_flush IPSET_TOKEN(MTYPE, _flush) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) #define mtype_elem IPSET_TOKEN(MTYPE, _elem) #define mtype_test IPSET_TOKEN(MTYPE, _test) #define mtype_add IPSET_TOKEN(MTYPE, _add) #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype MTYPE #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct mtype *map = set->data; timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); } static void mtype_ext_cleanup(struct ip_set *set) { struct mtype *map = set->data; u32 id; for (id = 0; id < map->elements; id++) if (test_bit(id, map->members)) ip_set_ext_destroy(set, get_ext(set, map, id)); } static void mtype_destroy(struct ip_set *set) { struct mtype *map = set->data; if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); ip_set_free(map->members); ip_set_free(map); set->data = NULL; } static void mtype_flush(struct ip_set *set) { struct mtype *map = set->data; if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); bitmap_zero(map->members, map->elements); set->elements = 0; set->ext_size = 0; } /* Calculate the actual memory size of the set data */ static size_t mtype_memsize(const struct mtype *map, size_t dsize) { return sizeof(*map) + map->memsize + map->elements * dsize; } static int mtype_head(struct ip_set *set, struct sk_buff *skb) { const struct mtype *map = set->data; struct nlattr *nested; size_t memsize = mtype_memsize(map, set->dsize) + set->ext_size; nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; nla_nest_end(skb, nested); return 0; nla_put_failure: return -EMSGSIZE; } static int mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); int ret = mtype_do_test(e, map, set->dsize); if (ret <= 0) return ret; return ip_set_match_extensions(set, ext, mext, flags, x); } static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); int ret = mtype_do_add(e, map, flags, set->dsize); if (ret == IPSET_ADD_FAILED) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(x, set))) { set->elements--; ret = 0; } else if (!(flags & IPSET_FLAG_EXIST)) { set_bit(e->id, map->members); return -IPSET_ERR_EXIST; } /* Element is re-added, cleanup extensions */ ip_set_ext_destroy(set, x); } if (ret > 0) set->elements--; if (SET_WITH_TIMEOUT(set)) #ifdef IP_SET_BITMAP_STORED_TIMEOUT mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret); #else ip_set_timeout_set(ext_timeout(x, set), ext->timeout); #endif if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(x, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(set, ext_comment(x, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(x, set), ext); /* Activate element */ set_bit(e->id, map->members); set->elements++; return 0; } static int mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); if (mtype_do_del(e, map)) return -IPSET_ERR_EXIST; ip_set_ext_destroy(set, x); set->elements--; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(x, set))) return -IPSET_ERR_EXIST; return 0; } #ifndef IP_SET_BITMAP_STORED_TIMEOUT static bool mtype_is_filled(const struct mtype_elem *x) { return true; } #endif static int mtype_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { struct mtype *map = set->data; struct nlattr *adt, *nested; void *x; u32 id, first = cb->args[IPSET_CB_ARG0]; int ret = 0; adt = nla_nest_start(skb, IPSET_ATTR_ADT); if (!adt) return -EMSGSIZE; /* Extensions may be replaced */ rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < map->elements; cb->args[IPSET_CB_ARG0]++) { cond_resched_rcu(); id = cb->args[IPSET_CB_ARG0]; x = get_ext(set, map, id); if (!test_bit(id, map->members) || (SET_WITH_TIMEOUT(set) && #ifdef IP_SET_BITMAP_STORED_TIMEOUT mtype_is_filled(x) && #endif ip_set_timeout_expired(ext_timeout(x, set)))) continue; nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) { if (id == first) { nla_nest_cancel(skb, adt); ret = -EMSGSIZE; goto out; } goto nla_put_failure; } if (mtype_do_list(skb, map, id, set->dsize)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, x, mtype_is_filled(x))) goto nla_put_failure; nla_nest_end(skb, nested); } nla_nest_end(skb, adt); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(id == first)) { cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; } nla_nest_end(skb, adt); out: rcu_read_unlock(); return ret; } static void mtype_gc(struct timer_list *t) { struct mtype *map = from_timer(map, t, gc); struct ip_set *set = map->set; void *x; u32 id; /* We run parallel with other readers (test element) * but adding/deleting new entries is locked out */ spin_lock_bh(&set->lock); for (id = 0; id < map->elements; id++) if (mtype_gc_test(id, map, set->dsize)) { x = get_ext(set, map, id); if (ip_set_timeout_expired(ext_timeout(x, set))) { clear_bit(id, map->members); ip_set_ext_destroy(set, x); set->elements--; } } spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); } static void mtype_cancel_gc(struct ip_set *set) { struct mtype *map = set->data; if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); } static const struct ip_set_type_variant mtype = { .kadt = mtype_kadt, .uadt = mtype_uadt, .adt = { [IPSET_ADD] = mtype_add, [IPSET_DEL] = mtype_del, [IPSET_TEST] = mtype_test, }, .destroy = mtype_destroy, .flush = mtype_flush, .head = mtype_head, .list = mtype_list, .same_set = mtype_same_set, .cancel_gc = mtype_cancel_gc, }; #endif /* __IP_SET_BITMAP_IP_GEN_H */ |
2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* md.h : kernel internal structure of the Linux MD driver Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman */ #ifndef _MD_MD_H #define _MD_MD_H #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/badblocks.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <trace/events/block.h> #include "md-cluster.h" #define MaxSector (~(sector_t)0) /* * These flags should really be called "NO_RETRY" rather than * "FAILFAST" because they don't make any promise about time lapse, * only about the number of retries, which will be zero. * REQ_FAILFAST_DRIVER is not included because * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.") * seems to suggest that the errors it avoids retrying should usually * be retried. */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) /* Status of sync thread. */ enum sync_action { /* * Represent by MD_RECOVERY_SYNC, start when: * 1) after assemble, sync data from first rdev to other copies, this * must be done first before other sync actions and will only execute * once; * 2) resize the array(notice that this is not reshape), sync data for * the new range; */ ACTION_RESYNC, /* * Represent by MD_RECOVERY_RECOVER, start when: * 1) for new replacement, sync data based on the replace rdev or * available copies from other rdev; * 2) for new member disk while the array is degraded, sync data from * other rdev; * 3) reassemble after power failure or re-add a hot removed rdev, sync * data from first rdev to other copies based on bitmap; */ ACTION_RECOVER, /* * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED | * MD_RECOVERY_CHECK, start when user echo "check" to sysfs api * sync_action, used to check if data copies from differenct rdev are * the same. The number of mismatch sectors will be exported to user * by sysfs api mismatch_cnt; */ ACTION_CHECK, /* * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, start when * user echo "repair" to sysfs api sync_action, usually paired with * ACTION_CHECK, used to force syncing data once user found that there * are inconsistent data, */ ACTION_REPAIR, /* * Represent by MD_RECOVERY_RESHAPE, start when new member disk is added * to the conf, notice that this is different from spares or * replacement; */ ACTION_RESHAPE, /* * Represent by MD_RECOVERY_FROZEN, can be set by sysfs api sync_action * or internal usage like setting the array read-only, will forbid above * actions. */ ACTION_FROZEN, /* * All above actions don't match. */ ACTION_IDLE, NR_SYNC_ACTIONS, }; /* * The struct embedded in rdev is used to serialize IO. */ struct serial_in_rdev { struct rb_root_cached serial_rb; spinlock_t serial_lock; wait_queue_head_t serial_io_wait; }; /* * MD's 'extended' device */ struct md_rdev { struct list_head same_set; /* RAID devices within the same set */ sector_t sectors; /* Device size (in 512bytes sectors) */ struct mddev *mddev; /* RAID array if running */ int last_events; /* IO event timestamp */ /* * If meta_bdev is non-NULL, it means that a separate device is * being used to store the metadata (superblock/bitmap) which * would otherwise be contained on the same device as the data (bdev). */ struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ struct file *bdev_file; /* Handle from open for bdev */ struct page *sb_page, *bb_page; int sb_loaded; __u64 sb_events; sector_t data_offset; /* start of data in array */ sector_t new_data_offset;/* only relevant while reshaping */ sector_t sb_start; /* offset of the super block (in 512byte sectors) */ int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ struct kobject kobj; /* A device can be in one of three states based on two flags: * Not working: faulty==1 in_sync==0 * Fully working: faulty==0 in_sync==1 * Working, but not * in sync with array * faulty==0 in_sync==0 * * It can never have faulty==1, in_sync==1 * This reduces the burden of testing multiple flags in many cases */ unsigned long flags; /* bit set of 'enum flag_bits' bits. */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ int new_raid_disk; /* role that the device will have in * the array after a level-change completes. */ int saved_raid_disk; /* role that device used to have in the * array and could again if we did a partial * resync from the bitmap */ union { sector_t recovery_offset;/* If this device has been partially * recovered, this is where we were * up to. */ sector_t journal_tail; /* If this device is a journal device, * this is the journal tail (journal * recovery start point) */ }; atomic_t nr_pending; /* number of pending requests. * only maintained for arrays that * support hot removal */ atomic_t read_errors; /* number of consecutive read errors that * we have tried to ignore. */ time64_t last_read_error; /* monotonic time since our * last read error */ atomic_t corrected_errors; /* number of corrected read errors, * for reporting to userspace and storing * in superblock. */ struct serial_in_rdev *serial; /* used for raid1 io serialization */ struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ struct kernfs_node *sysfs_unack_badblocks; /* handle for 'bad_blocks' sysfs dentry */ struct kernfs_node *sysfs_badblocks; struct badblocks badblocks; struct { short offset; /* Offset from superblock to start of PPL. * Not used by external metadata. */ unsigned int size; /* Size in sectors of the PPL space */ sector_t sector; /* First sector of the PPL space */ } ppl; }; enum flag_bits { Faulty, /* device is known to have a fault */ In_sync, /* device is in_sync with rest of array */ Bitmap_sync, /* ..actually, not quite In_sync. Need a * bitmap-based recovery to get fully in sync. * The bit is only meaningful before device * has been passed to pers->hot_add_disk. */ WriteMostly, /* Avoid reading if at all possible */ AutoDetected, /* added by auto-detect */ Blocked, /* An error occurred but has not yet * been acknowledged by the metadata * handler, so don't allow writes * until it is cleared */ WriteErrorSeen, /* A write error has been seen on this * device */ FaultRecorded, /* Intermediate state for clearing * Blocked. The Fault is/will-be * recorded in the metadata, but that * metadata hasn't been stored safely * on disk yet. */ BlockedBadBlocks, /* A writer is blocked because they * found an unacknowledged bad-block. * This can safely be cleared at any * time, and the writer will re-check. * It may be set at any time, and at * worst the writer will timeout and * re-check. So setting it as * accurately as possible is good, but * not absolutely critical. */ WantReplacement, /* This device is a candidate to be * hot-replaced, either because it has * reported some faults, or because * of explicit request. */ Replacement, /* This device is a replacement for * a want_replacement device with same * raid_disk number. */ Candidate, /* For clustered environments only: * This device is seen locally but not * by the whole cluster */ Journal, /* This device is used as journal for * raid-5/6. * Usually, this device should be faster * than other devices in the array */ ClusterRemove, ExternalBbl, /* External metadata provides bad * block management for a disk */ FailFast, /* Minimal retries should be attempted on * this device, so use REQ_FAILFAST_DEV. * Also don't try to repair failed reads. * It is expects that no bad block log * is present. */ LastDev, /* Seems to be the last working dev as * it didn't fail, so don't use FailFast * any more for metadata */ CollisionCheck, /* * check if there is collision between raid1 * serial bios. */ Nonrot, /* non-rotational device (SSD) */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { if (unlikely(rdev->badblocks.count)) { int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, sectors, first_bad, bad_sectors); if (rv) *first_bad -= rdev->data_offset; return rv; } return 0; } static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, int sectors) { sector_t first_bad; int bad_sectors; return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); } extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); struct md_cluster_info; /** * enum mddev_flags - md device flags. * @MD_ARRAY_FIRST_USE: First use of array, needs initialization. * @MD_CLOSING: If set, we are closing the array, do not open it then. * @MD_JOURNAL_CLEAN: A raid with journal is already clean. * @MD_HAS_JOURNAL: The raid array has journal feature set. * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took * resync lock, need to release the lock. * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as * calls to md_error() will never cause the array to * become failed. * @MD_HAS_PPL: The raid array has PPL feature set. * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. * @MD_DELETED: This device is being deleted * * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ enum mddev_flags { MD_ARRAY_FIRST_USE, MD_CLOSING, MD_JOURNAL_CLEAN, MD_HAS_JOURNAL, MD_CLUSTER_RESYNC_LOCKED, MD_FAILFAST_SUPPORTED, MD_HAS_PPL, MD_HAS_MULTIPLE_PPLS, MD_NOT_READY, MD_BROKEN, MD_DELETED, }; enum mddev_sb_flags { MD_SB_CHANGE_DEVS, /* Some device status has changed */ MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */ MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */ MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; #define NR_SERIAL_INFOS 8 /* record current range of serialize IOs */ struct serial_info { struct rb_node node; sector_t start; /* start sector of rb node */ sector_t last; /* end sector of rb node */ sector_t _subtree_last; /* highest sector in subtree of rb node */ }; /* * mddev->curr_resync stores the current sector of the resync but * also has some overloaded values. */ enum { /* No resync in progress */ MD_RESYNC_NONE = 0, /* Yielded to allow another conflicting resync to commence */ MD_RESYNC_YIELDED = 1, /* Delayed to check that there is no conflict with another sync */ MD_RESYNC_DELAYED = 2, /* Any value greater than or equal to this is in an active resync */ MD_RESYNC_ACTIVE = 3, }; struct mddev { void *private; struct md_personality *pers; dev_t unit; int md_minor; struct list_head disks; unsigned long flags; unsigned long sb_flags; int suspended; struct mutex suspend_mutex; struct percpu_ref active_io; int ro; int sysfs_active; /* set when sysfs deletes * are happening, so run/ * takeover/stop are not safe */ struct gendisk *gendisk; struct kobject kobj; int hold_active; #define UNTIL_IOCTL 1 #define UNTIL_STOP 2 /* Superblock information */ int major_version, minor_version, patch_version; int persistent; int external; /* metadata is * managed externally */ char metadata_type[17]; /* externally set*/ int chunk_sectors; time64_t ctime, utime; int level, layout; char clevel[16]; int raid_disks; int max_disks; sector_t dev_sectors; /* used size of * component devices */ sector_t array_sectors; /* exported array size */ int external_size; /* size managed * externally */ __u64 events; /* If the last 'event' was simply a clean->dirty transition, and * we didn't write it to the spares, then it is safe and simple * to just decrement the event count on a dirty->clean transition. * So we record that possibility here. */ int can_decrease_events; char uuid[16]; /* If the array is being reshaped, we need to record the * new shape and an indication of where we are up to. * This is written to the superblock. * If reshape_position is MaxSector, then no reshape is happening (yet). */ sector_t reshape_position; int delta_disks, new_level, new_layout; int new_chunk_sectors; int reshape_backwards; struct md_thread __rcu *thread; /* management thread */ struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ /* * Set when a sync operation is started. It holds this value even * when the sync thread is "frozen" (interrupted) or "idle" (stopped * or finished). It is overwritten when a new sync operation is begun. */ enum sync_action last_sync_action; sector_t curr_resync; /* last block scheduled */ /* As resync requests can complete out of order, we cannot easily track * how much resync has been completed. So we occasionally pause until * everything completes, then set curr_resync_completed to curr_resync. * As such it may be well behind the real resync mark, but it is a value * we are certain of. */ sector_t curr_resync_completed; unsigned long resync_mark; /* a recent timestamp */ sector_t resync_mark_cnt;/* blocks written at resync_mark */ sector_t curr_mark_cnt; /* blocks scheduled now */ sector_t resync_max_sectors; /* may be set by personality */ atomic64_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ /* allow user-space to request suspension of IO to regions of the array */ sector_t suspend_lo; sector_t suspend_hi; /* if zero, use the system-wide default */ int sync_speed_min; int sync_speed_max; /* resync even though the same disks are shared among md-devices */ int parallel_resync; int ok_start_degraded; unsigned long recovery; /* If a RAID personality determines that recovery (of a particular * device) will fail due to a read error on the source device, it * takes a copy of this number and does not attempt recovery again * until this number changes. */ int recovery_disabled; int in_sync; /* know to not need resync */ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so * that we are never stopping an array while it is open. * 'reconfig_mutex' protects all other reconfiguration. * These locks are separate due to conflicting interactions * with disk->open_mutex. * Lock ordering is: * reconfig_mutex -> disk->open_mutex * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open */ struct mutex open_mutex; struct mutex reconfig_mutex; atomic_t active; /* general refcount */ atomic_t openers; /* number of active opens */ int changed; /* True if we might need to * reread partition info */ int degraded; /* whether md should consider * adding a spare */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; sector_t recovery_cp; sector_t resync_min; /* user requested sync * starts here */ sector_t resync_max; /* resync should pause * when it gets here */ struct kernfs_node *sysfs_state; /* handle for 'array_state' * file in sysfs. */ struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */ struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ struct kernfs_node *sysfs_level; /*handle for 'level' */ /* used for delayed sysfs removal */ struct work_struct del_work; /* used for register new sync thread */ struct work_struct sync_work; /* "lock" protects: * flush_bio transition from NULL to !NULL * rdev superblocks, events * clearing MD_CHANGE_* * in_sync - and related safemode and MD_CHANGE changes * pers (also protected by reconfig_mutex and pending IO). * clearing ->bitmap * clearing ->bitmap_info.file * changing ->resync_{min,max} * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max}) */ spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ atomic_t pending_writes; /* number of active superblock writes */ unsigned int safemode; /* if set, update "clean" superblock * when no writes pending. */ unsigned int safemode_delay; struct timer_list safemode_timer; struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ struct bitmap *bitmap; /* the bitmap for the device */ struct { struct file *file; /* the bitmap file */ loff_t offset; /* offset from superblock of * start of bitmap. May be * negative, but not '0' * For external metadata, offset * from start of device. */ unsigned long space; /* space available at this offset */ loff_t default_offset; /* this is the offset to use when * hot-adding a bitmap. It should * eventually be settable by sysfs. */ unsigned long default_space; /* space available at * default offset */ struct mutex mutex; unsigned long chunksize; unsigned long daemon_sleep; /* how many jiffies between updates? */ unsigned long max_write_behind; /* write-behind mode */ int external; int nodes; /* Maximum number of nodes in the cluster */ char cluster_name[64]; /* Name of the cluster */ } bitmap_info; atomic_t max_corr_read_errors; /* max read retries */ struct list_head all_mddevs; const struct attribute_group *to_remove; struct bio_set bio_set; struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ struct bio_set io_clone_set; /* Generic flush handling. * The last to finish preflush schedules a worker to submit * the rest of the request (without the REQ_PREFLUSH flag). */ struct bio *flush_bio; atomic_t flush_pending; ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed * flush was started. */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ /* * Temporarily store rdev that will be finally removed when * reconfig_mutex is unlocked, protected by reconfig_mutex. */ struct list_head deleting; /* The sequence number for sync thread */ atomic_t sync_seq; bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; }; enum recovery_flags { /* flags for sync thread running status */ /* * set when one of sync action is set and new sync thread need to be * registered, or just add/remove spares from conf. */ MD_RECOVERY_NEEDED, /* sync thread is running, or about to be started */ MD_RECOVERY_RUNNING, /* sync thread needs to be aborted for some reason */ MD_RECOVERY_INTR, /* sync thread is done and is waiting to be unregistered */ MD_RECOVERY_DONE, /* running sync thread must abort immediately, and not restart */ MD_RECOVERY_FROZEN, /* waiting for pers->start() to finish */ MD_RECOVERY_WAIT, /* interrupted because io-error */ MD_RECOVERY_ERROR, /* flags determines sync action, see details in enum sync_action */ /* if just this flag is set, action is resync. */ MD_RECOVERY_SYNC, /* * paired with MD_RECOVERY_SYNC, if MD_RECOVERY_CHECK is not set, * action is repair, means user requested resync. */ MD_RECOVERY_REQUESTED, /* * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is * check. */ MD_RECOVERY_CHECK, /* recovery, or need to try it */ MD_RECOVERY_RECOVER, /* reshape */ MD_RECOVERY_RESHAPE, /* remote node is running resync thread */ MD_RESYNCING_REMOTE, }; enum md_ro_state { MD_RDWR, MD_RDONLY, MD_AUTO_READ, MD_MAX_STATE }; static inline bool md_is_rdwr(struct mddev *mddev) { return (mddev->ro == MD_RDWR); } static inline bool reshape_interrupted(struct mddev *mddev) { /* reshape never start */ if (mddev->reshape_position == MaxSector) return false; /* interrupted */ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return true; /* running reshape will be interrupted soon. */ if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || test_bit(MD_RECOVERY_INTR, &mddev->recovery) || test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) return true; return false; } static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); } /* Sometimes we need to take the lock in a situation where * failure due to interrupts is not acceptable. */ static inline void mddev_lock_nointr(struct mddev *mddev) { mutex_lock(&mddev->reconfig_mutex); } static inline int mddev_trylock(struct mddev *mddev) { return mutex_trylock(&mddev->reconfig_mutex); } extern void mddev_unlock(struct mddev *mddev); static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { if (blk_queue_io_stat(bdev->bd_disk->queue)) atomic_add(nr_sectors, &bdev->bd_disk->sync_io); } static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) { md_sync_acct(bio->bi_bdev, nr_sectors); } struct md_personality { char *name; int level; struct list_head list; struct module *owner; bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. tasks that * requires md_thread should go into start() */ int (*run)(struct mddev *mddev); /* start up works that require md threads */ int (*start)(struct mddev *mddev); void (*free)(struct mddev *mddev, void *priv); void (*status)(struct seq_file *seq, struct mddev *mddev); /* error_handler must set ->faulty and clear ->in_sync * if appropriate, and should abort recovery if needed */ void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*spare_active) (struct mddev *mddev); sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, sector_t max_sector, int *skipped); int (*resize) (struct mddev *mddev, sector_t sectors); sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour */ void (*quiesce) (struct mddev *mddev, int quiesce); /* takeover is used to transition an array from one * personality to another. The new personality must be able * to handle the data in the current layout. * e.g. 2drive raid1 -> 2drive raid5 * ndrive raid5 -> degraded n+1drive raid6 with special layout * If the takeover succeeds, a new 'private' structure is returned. * This needs to be installed and then ->run used to activate the * array. */ void *(*takeover) (struct mddev *mddev); /* Changes the consistency policy of an active array. */ int (*change_consistency_policy)(struct mddev *mddev, const char *buf); }; struct md_sysfs_entry { struct attribute attr; ssize_t (*show)(struct mddev *, char *); ssize_t (*store)(struct mddev *, const char *, size_t); }; extern const struct attribute_group md_bitmap_group; static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { if (sd) return sysfs_get_dirent(sd, name); return sd; } static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd) { if (sd) sysfs_notify_dirent(sd); } static inline char * mdname (struct mddev * mddev) { return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; } static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; if (!test_bit(Replacement, &rdev->flags) && !test_bit(Journal, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); } else return 0; } static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; if (!test_bit(Replacement, &rdev->flags) && !test_bit(Journal, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } } /* * iterates through some rdev ringlist. It's safe to remove the * current 'rdev'. Dont touch 'tmp' though. */ #define rdev_for_each_list(rdev, tmp, head) \ list_for_each_entry_safe(rdev, tmp, head, same_set) /* * iterates through the 'same array disks' ringlist */ #define rdev_for_each(rdev, mddev) \ list_for_each_entry(rdev, &((mddev)->disks), same_set) #define rdev_for_each_safe(rdev, tmp, mddev) \ list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) #define rdev_for_each_rcu(rdev, mddev) \ list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) struct md_thread { void (*run) (struct md_thread *thread); struct mddev *mddev; wait_queue_head_t wqueue; unsigned long flags; struct task_struct *tsk; unsigned long timeout; void *private; }; struct md_io_clone { struct mddev *mddev; struct bio *orig_bio; unsigned long start_time; struct bio bio_clone; }; #define THREAD_WAKEUP 0 static inline void safe_put_page(struct page *p) { if (p) put_page(p); } extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); extern int register_md_cluster_operations(const struct md_cluster_operations *ops, struct module *module); extern int unregister_md_cluster_operations(void); extern int md_setup_cluster(struct mddev *mddev, int nodes); extern void md_cluster_stop(struct mddev *mddev); extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp); extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern enum sync_action md_sync_action(struct mddev *mddev); extern enum sync_action md_sync_action_by_name(const char *page); extern const char *md_sync_action_name(enum sync_action action); extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); void md_account_bio(struct mddev *mddev, struct bio **bio); void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); extern int md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op); extern void md_do_sync(struct md_thread *thread); extern void md_new_event(void); extern void md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern int mddev_init(struct mddev *mddev); extern void mddev_destroy(struct mddev *mddev); void md_init_stacking_limits(struct queue_limits *lim); struct mddev *md_alloc(dev_t dev, char *name); void mddev_put(struct mddev *mddev); extern int md_run(struct mddev *mddev); extern int md_start(struct mddev *mddev); extern void md_stop(struct mddev *mddev); extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); extern bool md_handle_request(struct mddev *mddev, struct bio *bio); extern int mddev_suspend(struct mddev *mddev, bool interruptible); extern void mddev_resume(struct mddev *mddev); extern void md_idle_sync_thread(struct mddev *mddev); extern void md_frozen_sync_thread(struct mddev *mddev); extern void md_unfrozen_sync_thread(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev); extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); static inline bool is_rdev_broken(struct md_rdev *rdev) { return !disk_live(rdev->bdev->bd_disk); } static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { int faulty = test_bit(Faulty, &rdev->flags); if (atomic_dec_and_test(&rdev->nr_pending) && faulty) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } } extern const struct md_cluster_operations *md_cluster_ops; static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; } /* clear unsupported mddev_flags */ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, unsigned long unsupported_flags) { mddev->flags &= ~unsupported_flags; } static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0; } static inline int mddev_suspend_and_lock(struct mddev *mddev) { int ret; ret = mddev_suspend(mddev, true); if (ret) return ret; ret = mddev_lock(mddev); if (ret) mddev_resume(mddev); return ret; } static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev) { mddev_suspend(mddev, false); mutex_lock(&mddev->reconfig_mutex); } static inline void mddev_unlock_and_resume(struct mddev *mddev) { mddev_unlock(mddev); mddev_resume(mddev); } struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); int do_md_run(struct mddev *mddev); #define MDDEV_STACK_INTEGRITY (1u << 0) int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, unsigned int flags); int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); extern const struct block_device_operations md_fops; /* * MD devices can be used undeneath by DM, in which case ->gendisk is NULL. */ static inline bool mddev_is_dm(struct mddev *mddev) { return !mddev->gendisk; } static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, sector_t sector) { if (!mddev_is_dm(mddev)) trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector); } #define mddev_add_trace_msg(mddev, fmt, args...) \ do { \ if (!mddev_is_dm(mddev)) \ blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \ } while (0) #endif /* _MD_MD_H */ |
45 45 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cred.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/quotaops.h> #include <linux/sched.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/genetlink.h> static const struct genl_multicast_group quota_mcgrps[] = { { .name = "events", }, }; /* Netlink family structure for quota */ static struct genl_family quota_genl_family __ro_after_init = { .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, .maxattr = QUOTA_NL_A_MAX, .mcgrps = quota_mcgrps, .n_mcgrps = ARRAY_SIZE(quota_mcgrps), }; /** * quota_send_warning - Send warning to userspace about exceeded quota * @qid: The kernel internal quota identifier. * @dev: The device on which the fs is mounted (sb->s_dev) * @warntype: The type of the warning: QUOTA_NL_... * * This can be used by filesystems (including those which don't use * dquot) to send a message to userspace relating to quota limits. * */ void quota_send_warning(struct kqid qid, dev_t dev, const char warntype) { static atomic_t seq; struct sk_buff *skb; void *msg_head; int ret; int msg_size = 4 * nla_total_size(sizeof(u32)) + 2 * nla_total_size_64bit(sizeof(u64)); /* We have to allocate using GFP_NOFS as we are called from a * filesystem performing write and thus further recursion into * the fs to free some data could cause deadlocks. */ skb = genlmsg_new(msg_size, GFP_NOFS); if (!skb) { printk(KERN_ERR "VFS: Not enough memory to send quota warning.\n"); return; } msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), "a_genl_family, 0, QUOTA_NL_C_WARNING); if (!msg_head) { printk(KERN_ERR "VFS: Cannot store netlink header in quota warning.\n"); goto err_out; } ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, qid.type); if (ret) goto attr_err_out; ret = nla_put_u64_64bit(skb, QUOTA_NL_A_EXCESS_ID, from_kqid_munged(&init_user_ns, qid), QUOTA_NL_A_PAD); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); if (ret) goto attr_err_out; ret = nla_put_u64_64bit(skb, QUOTA_NL_A_CAUSED_ID, from_kuid_munged(&init_user_ns, current_uid()), QUOTA_NL_A_PAD); if (ret) goto attr_err_out; genlmsg_end(skb, msg_head); genlmsg_multicast("a_genl_family, skb, 0, 0, GFP_NOFS); return; attr_err_out: printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); err_out: kfree_skb(skb); } EXPORT_SYMBOL(quota_send_warning); static int __init quota_init(void) { if (genl_register_family("a_genl_family) != 0) printk(KERN_ERR "VFS: Failed to create quota netlink interface.\n"); return 0; }; fs_initcall(quota_init); |
1 1 1 2 1 1 1 5 7 1 6 6 20 76 2 112 20 6911 5435 19 12 20 15 7 17 17 15 5 85 15 75 20 54 14 7 74 17 73 10 65 1 46 1014 19 47 17 6 8 6 6 22 22 22 22 486 486 22 25 46 9 39 24 2 3 17 4477 4482 174 26 46 3 3 1 126 46 3 174 3316 3314 50 49 74 73 261 261 24 24 88 410 494 494 1221 1221 23 140 139 187 187 126 126 127 126 3 13 7692 4405 4408 16 9 8 313 11727 11716 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 | // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Filesystem management and hooks * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI * Copyright © 2021-2022 Microsoft Corporation * Copyright © 2022 Günther Noack <gnoack3000@gmail.com> * Copyright © 2023-2024 Google LLC */ #include <asm/ioctls.h> #include <kunit/test.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bits.h> #include <linux/compiler_types.h> #include <linux/dcache.h> #include <linux/err.h> #include <linux/falloc.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/lsm_hooks.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/path.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/stat.h> #include <linux/types.h> #include <linux/wait_bit.h> #include <linux/workqueue.h> #include <uapi/linux/fiemap.h> #include <uapi/linux/landlock.h> #include "common.h" #include "cred.h" #include "fs.h" #include "limits.h" #include "object.h" #include "ruleset.h" #include "setup.h" /* Underlying object management */ static void release_inode(struct landlock_object *const object) __releases(object->lock) { struct inode *const inode = object->underobj; struct super_block *sb; if (!inode) { spin_unlock(&object->lock); return; } /* * Protects against concurrent use by hook_sb_delete() of the reference * to the underlying inode. */ object->underobj = NULL; /* * Makes sure that if the filesystem is concurrently unmounted, * hook_sb_delete() will wait for us to finish iput(). */ sb = inode->i_sb; atomic_long_inc(&landlock_superblock(sb)->inode_refs); spin_unlock(&object->lock); /* * Because object->underobj was not NULL, hook_sb_delete() and * get_inode_object() guarantee that it is safe to reset * landlock_inode(inode)->object while it is not NULL. It is therefore * not necessary to lock inode->i_lock. */ rcu_assign_pointer(landlock_inode(inode)->object, NULL); /* * Now, new rules can safely be tied to @inode with get_inode_object(). */ iput(inode); if (atomic_long_dec_and_test(&landlock_superblock(sb)->inode_refs)) wake_up_var(&landlock_superblock(sb)->inode_refs); } static const struct landlock_object_underops landlock_fs_underops = { .release = release_inode }; /* IOCTL helpers */ /** * is_masked_device_ioctl - Determine whether an IOCTL command is always * permitted with Landlock for device files. These commands can not be * restricted on device files by enforcing a Landlock policy. * * @cmd: The IOCTL command that is supposed to be run. * * By default, any IOCTL on a device file requires the * LANDLOCK_ACCESS_FS_IOCTL_DEV right. However, we blanket-permit some * commands, if: * * 1. The command is implemented in fs/ioctl.c's do_vfs_ioctl(), * not in f_ops->unlocked_ioctl() or f_ops->compat_ioctl(). * * 2. The command is harmless when invoked on devices. * * We also permit commands that do not make sense for devices, but where the * do_vfs_ioctl() implementation returns a more conventional error code. * * Any new IOCTL commands that are implemented in fs/ioctl.c's do_vfs_ioctl() * should be considered for inclusion here. * * Returns: true if the IOCTL @cmd can not be restricted with Landlock for * device files. */ static __attribute_const__ bool is_masked_device_ioctl(const unsigned int cmd) { switch (cmd) { /* * FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC manipulate the FD's * close-on-exec and the file's buffered-IO and async flags. These * operations are also available through fcntl(2), and are * unconditionally permitted in Landlock. */ case FIOCLEX: case FIONCLEX: case FIONBIO: case FIOASYNC: /* * FIOQSIZE queries the size of a regular file, directory, or link. * * We still permit it, because it always returns -ENOTTY for * other file types. */ case FIOQSIZE: /* * FIFREEZE and FITHAW freeze and thaw the file system which the * given file belongs to. Requires CAP_SYS_ADMIN. * * These commands operate on the file system's superblock rather * than on the file itself. The same operations can also be * done through any other file or directory on the same file * system, so it is safe to permit these. */ case FIFREEZE: case FITHAW: /* * FS_IOC_FIEMAP queries information about the allocation of * blocks within a file. * * This IOCTL command only makes sense for regular files and is * not implemented by devices. It is harmless to permit. */ case FS_IOC_FIEMAP: /* * FIGETBSZ queries the file system's block size for a file or * directory. * * This command operates on the file system's superblock rather * than on the file itself. The same operation can also be done * through any other file or directory on the same file system, * so it is safe to permit it. */ case FIGETBSZ: /* * FICLONE, FICLONERANGE and FIDEDUPERANGE make files share * their underlying storage ("reflink") between source and * destination FDs, on file systems which support that. * * These IOCTL commands only apply to regular files * and are harmless to permit for device files. */ case FICLONE: case FICLONERANGE: case FIDEDUPERANGE: /* * FS_IOC_GETFSUUID and FS_IOC_GETFSSYSFSPATH both operate on * the file system superblock, not on the specific file, so * these operations are available through any other file on the * same file system as well. */ case FS_IOC_GETFSUUID: case FS_IOC_GETFSSYSFSPATH: return true; /* * FIONREAD, FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_IOC_FSGETXATTR and * FS_IOC_FSSETXATTR are forwarded to device implementations. */ /* * file_ioctl() commands (FIBMAP, FS_IOC_RESVSP, FS_IOC_RESVSP64, * FS_IOC_UNRESVSP, FS_IOC_UNRESVSP64 and FS_IOC_ZERO_RANGE) are * forwarded to device implementations, so not permitted. */ /* Other commands are guarded by the access right. */ default: return false; } } /* * is_masked_device_ioctl_compat - same as the helper above, but checking the * "compat" IOCTL commands. * * The IOCTL commands with special handling in compat-mode should behave the * same as their non-compat counterparts. */ static __attribute_const__ bool is_masked_device_ioctl_compat(const unsigned int cmd) { switch (cmd) { /* FICLONE is permitted, same as in the non-compat variant. */ case FICLONE: return true; #if defined(CONFIG_X86_64) /* * FS_IOC_RESVSP_32, FS_IOC_RESVSP64_32, FS_IOC_UNRESVSP_32, * FS_IOC_UNRESVSP64_32, FS_IOC_ZERO_RANGE_32: not blanket-permitted, * for consistency with their non-compat variants. */ case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: case FS_IOC_UNRESVSP_32: case FS_IOC_UNRESVSP64_32: case FS_IOC_ZERO_RANGE_32: #endif /* * FS_IOC32_GETFLAGS, FS_IOC32_SETFLAGS are forwarded to their device * implementations. */ case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: return false; default: return is_masked_device_ioctl(cmd); } } /* Ruleset management */ static struct landlock_object *get_inode_object(struct inode *const inode) { struct landlock_object *object, *new_object; struct landlock_inode_security *inode_sec = landlock_inode(inode); rcu_read_lock(); retry: object = rcu_dereference(inode_sec->object); if (object) { if (likely(refcount_inc_not_zero(&object->usage))) { rcu_read_unlock(); return object; } /* * We are racing with release_inode(), the object is going * away. Wait for release_inode(), then retry. */ spin_lock(&object->lock); spin_unlock(&object->lock); goto retry; } rcu_read_unlock(); /* * If there is no object tied to @inode, then create a new one (without * holding any locks). */ new_object = landlock_create_object(&landlock_fs_underops, inode); if (IS_ERR(new_object)) return new_object; /* * Protects against concurrent calls to get_inode_object() or * hook_sb_delete(). */ spin_lock(&inode->i_lock); if (unlikely(rcu_access_pointer(inode_sec->object))) { /* Someone else just created the object, bail out and retry. */ spin_unlock(&inode->i_lock); kfree(new_object); rcu_read_lock(); goto retry; } /* * @inode will be released by hook_sb_delete() on its superblock * shutdown, or by release_inode() when no more ruleset references the * related object. */ ihold(inode); rcu_assign_pointer(inode_sec->object, new_object); spin_unlock(&inode->i_lock); return new_object; } /* All access rights that can be tied to files. */ /* clang-format off */ #define ACCESS_FILE ( \ LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ LANDLOCK_ACCESS_FS_TRUNCATE | \ LANDLOCK_ACCESS_FS_IOCTL_DEV) /* clang-format on */ /* * @path: Should have been checked by get_path_from_fd(). */ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset, const struct path *const path, access_mask_t access_rights) { int err; struct landlock_id id = { .type = LANDLOCK_KEY_INODE, }; /* Files only get access rights that make sense. */ if (!d_is_dir(path->dentry) && (access_rights | ACCESS_FILE) != ACCESS_FILE) return -EINVAL; if (WARN_ON_ONCE(ruleset->num_layers != 1)) return -EINVAL; /* Transforms relative access rights to absolute ones. */ access_rights |= LANDLOCK_MASK_ACCESS_FS & ~landlock_get_fs_access_mask(ruleset, 0); id.key.object = get_inode_object(d_backing_inode(path->dentry)); if (IS_ERR(id.key.object)) return PTR_ERR(id.key.object); mutex_lock(&ruleset->lock); err = landlock_insert_rule(ruleset, id, access_rights); mutex_unlock(&ruleset->lock); /* * No need to check for an error because landlock_insert_rule() * increments the refcount for the new object if needed. */ landlock_put_object(id.key.object); return err; } /* Access-control management */ /* * The lifetime of the returned rule is tied to @domain. * * Returns NULL if no rule is found or if @dentry is negative. */ static const struct landlock_rule * find_rule(const struct landlock_ruleset *const domain, const struct dentry *const dentry) { const struct landlock_rule *rule; const struct inode *inode; struct landlock_id id = { .type = LANDLOCK_KEY_INODE, }; /* Ignores nonexistent leafs. */ if (d_is_negative(dentry)) return NULL; inode = d_backing_inode(dentry); rcu_read_lock(); id.key.object = rcu_dereference(landlock_inode(inode)->object); rule = landlock_find_rule(domain, id); rcu_read_unlock(); return rule; } /* * Allows access to pseudo filesystems that will never be mountable (e.g. * sockfs, pipefs), but can still be reachable through * /proc/<pid>/fd/<file-descriptor> */ static bool is_nouser_or_private(const struct dentry *dentry) { return (dentry->d_sb->s_flags & SB_NOUSER) || (d_is_positive(dentry) && unlikely(IS_PRIVATE(d_backing_inode(dentry)))); } static access_mask_t get_raw_handled_fs_accesses(const struct landlock_ruleset *const domain) { access_mask_t access_dom = 0; size_t layer_level; for (layer_level = 0; layer_level < domain->num_layers; layer_level++) access_dom |= landlock_get_raw_fs_access_mask(domain, layer_level); return access_dom; } static access_mask_t get_handled_fs_accesses(const struct landlock_ruleset *const domain) { /* Handles all initially denied by default access rights. */ return get_raw_handled_fs_accesses(domain) | LANDLOCK_ACCESS_FS_INITIALLY_DENIED; } static const struct landlock_ruleset * get_fs_domain(const struct landlock_ruleset *const domain) { if (!domain || !get_raw_handled_fs_accesses(domain)) return NULL; return domain; } static const struct landlock_ruleset *get_current_fs_domain(void) { return get_fs_domain(landlock_get_current_domain()); } /* * Check that a destination file hierarchy has more restrictions than a source * file hierarchy. This is only used for link and rename actions. * * @layer_masks_child2: Optional child masks. */ static bool no_more_access( const layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS], const layer_mask_t (*const layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS], const bool child1_is_directory, const layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS], const layer_mask_t (*const layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS], const bool child2_is_directory) { unsigned long access_bit; for (access_bit = 0; access_bit < ARRAY_SIZE(*layer_masks_parent2); access_bit++) { /* Ignores accesses that only make sense for directories. */ const bool is_file_access = !!(BIT_ULL(access_bit) & ACCESS_FILE); if (child1_is_directory || is_file_access) { /* * Checks if the destination restrictions are a * superset of the source ones (i.e. inherited access * rights without child exceptions): * restrictions(parent2) >= restrictions(child1) */ if ((((*layer_masks_parent1)[access_bit] & (*layer_masks_child1)[access_bit]) | (*layer_masks_parent2)[access_bit]) != (*layer_masks_parent2)[access_bit]) return false; } if (!layer_masks_child2) continue; if (child2_is_directory || is_file_access) { /* * Checks inverted restrictions for RENAME_EXCHANGE: * restrictions(parent1) >= restrictions(child2) */ if ((((*layer_masks_parent2)[access_bit] & (*layer_masks_child2)[access_bit]) | (*layer_masks_parent1)[access_bit]) != (*layer_masks_parent1)[access_bit]) return false; } } return true; } #define NMA_TRUE(...) KUNIT_EXPECT_TRUE(test, no_more_access(__VA_ARGS__)) #define NMA_FALSE(...) KUNIT_EXPECT_FALSE(test, no_more_access(__VA_ARGS__)) #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST static void test_no_more_access(struct kunit *const test) { const layer_mask_t rx0[LANDLOCK_NUM_ACCESS_FS] = { [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), [BIT_INDEX(LANDLOCK_ACCESS_FS_READ_FILE)] = BIT_ULL(0), }; const layer_mask_t mx0[LANDLOCK_NUM_ACCESS_FS] = { [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), [BIT_INDEX(LANDLOCK_ACCESS_FS_MAKE_REG)] = BIT_ULL(0), }; const layer_mask_t x0[LANDLOCK_NUM_ACCESS_FS] = { [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), }; const layer_mask_t x1[LANDLOCK_NUM_ACCESS_FS] = { [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(1), }; const layer_mask_t x01[LANDLOCK_NUM_ACCESS_FS] = { [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0) | BIT_ULL(1), }; const layer_mask_t allows_all[LANDLOCK_NUM_ACCESS_FS] = {}; /* Checks without restriction. */ NMA_TRUE(&x0, &allows_all, false, &allows_all, NULL, false); NMA_TRUE(&allows_all, &x0, false, &allows_all, NULL, false); NMA_FALSE(&x0, &x0, false, &allows_all, NULL, false); /* * Checks that we can only refer a file if no more access could be * inherited. */ NMA_TRUE(&x0, &x0, false, &rx0, NULL, false); NMA_TRUE(&rx0, &rx0, false, &rx0, NULL, false); NMA_FALSE(&rx0, &rx0, false, &x0, NULL, false); NMA_FALSE(&rx0, &rx0, false, &x1, NULL, false); /* Checks allowed referring with different nested domains. */ NMA_TRUE(&x0, &x1, false, &x0, NULL, false); NMA_TRUE(&x1, &x0, false, &x0, NULL, false); NMA_TRUE(&x0, &x01, false, &x0, NULL, false); NMA_TRUE(&x0, &x01, false, &rx0, NULL, false); NMA_TRUE(&x01, &x0, false, &x0, NULL, false); NMA_TRUE(&x01, &x0, false, &rx0, NULL, false); NMA_FALSE(&x01, &x01, false, &x0, NULL, false); /* Checks that file access rights are also enforced for a directory. */ NMA_FALSE(&rx0, &rx0, true, &x0, NULL, false); /* Checks that directory access rights don't impact file referring... */ NMA_TRUE(&mx0, &mx0, false, &x0, NULL, false); /* ...but only directory referring. */ NMA_FALSE(&mx0, &mx0, true, &x0, NULL, false); /* Checks directory exchange. */ NMA_TRUE(&mx0, &mx0, true, &mx0, &mx0, true); NMA_TRUE(&mx0, &mx0, true, &mx0, &x0, true); NMA_FALSE(&mx0, &mx0, true, &x0, &mx0, true); NMA_FALSE(&mx0, &mx0, true, &x0, &x0, true); NMA_FALSE(&mx0, &mx0, true, &x1, &x1, true); /* Checks file exchange with directory access rights... */ NMA_TRUE(&mx0, &mx0, false, &mx0, &mx0, false); NMA_TRUE(&mx0, &mx0, false, &mx0, &x0, false); NMA_TRUE(&mx0, &mx0, false, &x0, &mx0, false); NMA_TRUE(&mx0, &mx0, false, &x0, &x0, false); /* ...and with file access rights. */ NMA_TRUE(&rx0, &rx0, false, &rx0, &rx0, false); NMA_TRUE(&rx0, &rx0, false, &rx0, &x0, false); NMA_FALSE(&rx0, &rx0, false, &x0, &rx0, false); NMA_FALSE(&rx0, &rx0, false, &x0, &x0, false); NMA_FALSE(&rx0, &rx0, false, &x1, &x1, false); /* * Allowing the following requests should not be a security risk * because domain 0 denies execute access, and domain 1 is always * nested with domain 0. However, adding an exception for this case * would mean to check all nested domains to make sure none can get * more privileges (e.g. processes only sandboxed by domain 0). * Moreover, this behavior (i.e. composition of N domains) could then * be inconsistent compared to domain 1's ruleset alone (e.g. it might * be denied to link/rename with domain 1's ruleset, whereas it would * be allowed if nested on top of domain 0). Another drawback would be * to create a cover channel that could enable sandboxed processes to * infer most of the filesystem restrictions from their domain. To * make it simple, efficient, safe, and more consistent, this case is * always denied. */ NMA_FALSE(&x1, &x1, false, &x0, NULL, false); NMA_FALSE(&x1, &x1, false, &rx0, NULL, false); NMA_FALSE(&x1, &x1, true, &x0, NULL, false); NMA_FALSE(&x1, &x1, true, &rx0, NULL, false); /* Checks the same case of exclusive domains with a file... */ NMA_TRUE(&x1, &x1, false, &x01, NULL, false); NMA_FALSE(&x1, &x1, false, &x01, &x0, false); NMA_FALSE(&x1, &x1, false, &x01, &x01, false); NMA_FALSE(&x1, &x1, false, &x0, &x0, false); /* ...and with a directory. */ NMA_FALSE(&x1, &x1, false, &x0, &x0, true); NMA_FALSE(&x1, &x1, true, &x0, &x0, false); NMA_FALSE(&x1, &x1, true, &x0, &x0, true); } #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ #undef NMA_TRUE #undef NMA_FALSE /* * Removes @layer_masks accesses that are not requested. * * Returns true if the request is allowed, false otherwise. */ static bool scope_to_request(const access_mask_t access_request, layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS]) { const unsigned long access_req = access_request; unsigned long access_bit; if (WARN_ON_ONCE(!layer_masks)) return true; for_each_clear_bit(access_bit, &access_req, ARRAY_SIZE(*layer_masks)) (*layer_masks)[access_bit] = 0; return !memchr_inv(layer_masks, 0, sizeof(*layer_masks)); } #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST static void test_scope_to_request_with_exec_none(struct kunit *const test) { /* Allows everything. */ layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; /* Checks and scopes with execute. */ KUNIT_EXPECT_TRUE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE, &layer_masks)); KUNIT_EXPECT_EQ(test, 0, layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]); KUNIT_EXPECT_EQ(test, 0, layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]); } static void test_scope_to_request_with_exec_some(struct kunit *const test) { /* Denies execute and write. */ layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1), }; /* Checks and scopes with execute. */ KUNIT_EXPECT_FALSE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE, &layer_masks)); KUNIT_EXPECT_EQ(test, BIT_ULL(0), layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]); KUNIT_EXPECT_EQ(test, 0, layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]); } static void test_scope_to_request_without_access(struct kunit *const test) { /* Denies execute and write. */ layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0), [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1), }; /* Checks and scopes without access request. */ KUNIT_EXPECT_TRUE(test, scope_to_request(0, &layer_masks)); KUNIT_EXPECT_EQ(test, 0, layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]); KUNIT_EXPECT_EQ(test, 0, layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]); } #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ /* * Returns true if there is at least one access right different than * LANDLOCK_ACCESS_FS_REFER. */ static bool is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS], const access_mask_t access_request) { unsigned long access_bit; /* LANDLOCK_ACCESS_FS_REFER alone must return -EXDEV. */ const unsigned long access_check = access_request & ~LANDLOCK_ACCESS_FS_REFER; if (!layer_masks) return false; for_each_set_bit(access_bit, &access_check, ARRAY_SIZE(*layer_masks)) { if ((*layer_masks)[access_bit]) return true; } return false; } #define IE_TRUE(...) KUNIT_EXPECT_TRUE(test, is_eacces(__VA_ARGS__)) #define IE_FALSE(...) KUNIT_EXPECT_FALSE(test, is_eacces(__VA_ARGS__)) #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST static void test_is_eacces_with_none(struct kunit *const test) { const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; IE_FALSE(&layer_masks, 0); IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER); IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE); IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE); } static void test_is_eacces_with_refer(struct kunit *const test) { const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { [BIT_INDEX(LANDLOCK_ACCESS_FS_REFER)] = BIT_ULL(0), }; IE_FALSE(&layer_masks, 0); IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER); IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE); IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE); } static void test_is_eacces_with_write(struct kunit *const test) { const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = { [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(0), }; IE_FALSE(&layer_masks, 0); IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER); IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE); IE_TRUE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE); } #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ #undef IE_TRUE #undef IE_FALSE /** * is_access_to_paths_allowed - Check accesses for requests with a common path * * @domain: Domain to check against. * @path: File hierarchy to walk through. * @access_request_parent1: Accesses to check, once @layer_masks_parent1 is * equal to @layer_masks_parent2 (if any). This is tied to the unique * requested path for most actions, or the source in case of a refer action * (i.e. rename or link), or the source and destination in case of * RENAME_EXCHANGE. * @layer_masks_parent1: Pointer to a matrix of layer masks per access * masks, identifying the layers that forbid a specific access. Bits from * this matrix can be unset according to the @path walk. An empty matrix * means that @domain allows all possible Landlock accesses (i.e. not only * those identified by @access_request_parent1). This matrix can * initially refer to domain layer masks and, when the accesses for the * destination and source are the same, to requested layer masks. * @dentry_child1: Dentry to the initial child of the parent1 path. This * pointer must be NULL for non-refer actions (i.e. not link nor rename). * @access_request_parent2: Similar to @access_request_parent1 but for a * request involving a source and a destination. This refers to the * destination, except in case of RENAME_EXCHANGE where it also refers to * the source. Must be set to 0 when using a simple path request. * @layer_masks_parent2: Similar to @layer_masks_parent1 but for a refer * action. This must be NULL otherwise. * @dentry_child2: Dentry to the initial child of the parent2 path. This * pointer is only set for RENAME_EXCHANGE actions and must be NULL * otherwise. * * This helper first checks that the destination has a superset of restrictions * compared to the source (if any) for a common path. Because of * RENAME_EXCHANGE actions, source and destinations may be swapped. It then * checks that the collected accesses and the remaining ones are enough to * allow the request. * * Returns: * - true if the access request is granted; * - false otherwise. */ static bool is_access_to_paths_allowed( const struct landlock_ruleset *const domain, const struct path *const path, const access_mask_t access_request_parent1, layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS], const struct dentry *const dentry_child1, const access_mask_t access_request_parent2, layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS], const struct dentry *const dentry_child2) { bool allowed_parent1 = false, allowed_parent2 = false, is_dom_check, child1_is_directory = true, child2_is_directory = true; struct path walker_path; access_mask_t access_masked_parent1, access_masked_parent2; layer_mask_t _layer_masks_child1[LANDLOCK_NUM_ACCESS_FS], _layer_masks_child2[LANDLOCK_NUM_ACCESS_FS]; layer_mask_t(*layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS] = NULL, (*layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS] = NULL; if (!access_request_parent1 && !access_request_parent2) return true; if (WARN_ON_ONCE(!domain || !path)) return true; if (is_nouser_or_private(path->dentry)) return true; if (WARN_ON_ONCE(domain->num_layers < 1 || !layer_masks_parent1)) return false; if (unlikely(layer_masks_parent2)) { if (WARN_ON_ONCE(!dentry_child1)) return false; /* * For a double request, first check for potential privilege * escalation by looking at domain handled accesses (which are * a superset of the meaningful requested accesses). */ access_masked_parent1 = access_masked_parent2 = get_handled_fs_accesses(domain); is_dom_check = true; } else { if (WARN_ON_ONCE(dentry_child1 || dentry_child2)) return false; /* For a simple request, only check for requested accesses. */ access_masked_parent1 = access_request_parent1; access_masked_parent2 = access_request_parent2; is_dom_check = false; } if (unlikely(dentry_child1)) { landlock_unmask_layers( find_rule(domain, dentry_child1), landlock_init_layer_masks( domain, LANDLOCK_MASK_ACCESS_FS, &_layer_masks_child1, LANDLOCK_KEY_INODE), &_layer_masks_child1, ARRAY_SIZE(_layer_masks_child1)); layer_masks_child1 = &_layer_masks_child1; child1_is_directory = d_is_dir(dentry_child1); } if (unlikely(dentry_child2)) { landlock_unmask_layers( find_rule(domain, dentry_child2), landlock_init_layer_masks( domain, LANDLOCK_MASK_ACCESS_FS, &_layer_masks_child2, LANDLOCK_KEY_INODE), &_layer_masks_child2, ARRAY_SIZE(_layer_masks_child2)); layer_masks_child2 = &_layer_masks_child2; child2_is_directory = d_is_dir(dentry_child2); } walker_path = *path; path_get(&walker_path); /* * We need to walk through all the hierarchy to not miss any relevant * restriction. */ while (true) { struct dentry *parent_dentry; const struct landlock_rule *rule; /* * If at least all accesses allowed on the destination are * already allowed on the source, respectively if there is at * least as much as restrictions on the destination than on the * source, then we can safely refer files from the source to * the destination without risking a privilege escalation. * This also applies in the case of RENAME_EXCHANGE, which * implies checks on both direction. This is crucial for * standalone multilayered security policies. Furthermore, * this helps avoid policy writers to shoot themselves in the * foot. */ if (unlikely(is_dom_check && no_more_access( layer_masks_parent1, layer_masks_child1, child1_is_directory, layer_masks_parent2, layer_masks_child2, child2_is_directory))) { allowed_parent1 = scope_to_request( access_request_parent1, layer_masks_parent1); allowed_parent2 = scope_to_request( access_request_parent2, layer_masks_parent2); /* Stops when all accesses are granted. */ if (allowed_parent1 && allowed_parent2) break; /* * Now, downgrades the remaining checks from domain * handled accesses to requested accesses. */ is_dom_check = false; access_masked_parent1 = access_request_parent1; access_masked_parent2 = access_request_parent2; } rule = find_rule(domain, walker_path.dentry); allowed_parent1 = landlock_unmask_layers( rule, access_masked_parent1, layer_masks_parent1, ARRAY_SIZE(*layer_masks_parent1)); allowed_parent2 = landlock_unmask_layers( rule, access_masked_parent2, layer_masks_parent2, ARRAY_SIZE(*layer_masks_parent2)); /* Stops when a rule from each layer grants access. */ if (allowed_parent1 && allowed_parent2) break; jump_up: if (walker_path.dentry == walker_path.mnt->mnt_root) { if (follow_up(&walker_path)) { /* Ignores hidden mount points. */ goto jump_up; } else { /* * Stops at the real root. Denies access * because not all layers have granted access. */ break; } } if (unlikely(IS_ROOT(walker_path.dentry))) { /* * Stops at disconnected root directories. Only allows * access to internal filesystems (e.g. nsfs, which is * reachable through /proc/<pid>/ns/<namespace>). */ allowed_parent1 = allowed_parent2 = !!(walker_path.mnt->mnt_flags & MNT_INTERNAL); break; } parent_dentry = dget_parent(walker_path.dentry); dput(walker_path.dentry); walker_path.dentry = parent_dentry; } path_put(&walker_path); return allowed_parent1 && allowed_parent2; } static int check_access_path(const struct landlock_ruleset *const domain, const struct path *const path, access_mask_t access_request) { layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; access_request = landlock_init_layer_masks( domain, access_request, &layer_masks, LANDLOCK_KEY_INODE); if (is_access_to_paths_allowed(domain, path, access_request, &layer_masks, NULL, 0, NULL, NULL)) return 0; return -EACCES; } static int current_check_access_path(const struct path *const path, const access_mask_t access_request) { const struct landlock_ruleset *const dom = get_current_fs_domain(); if (!dom) return 0; return check_access_path(dom, path, access_request); } static access_mask_t get_mode_access(const umode_t mode) { switch (mode & S_IFMT) { case S_IFLNK: return LANDLOCK_ACCESS_FS_MAKE_SYM; case 0: /* A zero mode translates to S_IFREG. */ case S_IFREG: return LANDLOCK_ACCESS_FS_MAKE_REG; case S_IFDIR: return LANDLOCK_ACCESS_FS_MAKE_DIR; case S_IFCHR: return LANDLOCK_ACCESS_FS_MAKE_CHAR; case S_IFBLK: return LANDLOCK_ACCESS_FS_MAKE_BLOCK; case S_IFIFO: return LANDLOCK_ACCESS_FS_MAKE_FIFO; case S_IFSOCK: return LANDLOCK_ACCESS_FS_MAKE_SOCK; default: WARN_ON_ONCE(1); return 0; } } static access_mask_t maybe_remove(const struct dentry *const dentry) { if (d_is_negative(dentry)) return 0; return d_is_dir(dentry) ? LANDLOCK_ACCESS_FS_REMOVE_DIR : LANDLOCK_ACCESS_FS_REMOVE_FILE; } /** * collect_domain_accesses - Walk through a file path and collect accesses * * @domain: Domain to check against. * @mnt_root: Last directory to check. * @dir: Directory to start the walk from. * @layer_masks_dom: Where to store the collected accesses. * * This helper is useful to begin a path walk from the @dir directory to a * @mnt_root directory used as a mount point. This mount point is the common * ancestor between the source and the destination of a renamed and linked * file. While walking from @dir to @mnt_root, we record all the domain's * allowed accesses in @layer_masks_dom. * * This is similar to is_access_to_paths_allowed() but much simpler because it * only handles walking on the same mount point and only checks one set of * accesses. * * Returns: * - true if all the domain access rights are allowed for @dir; * - false if the walk reached @mnt_root. */ static bool collect_domain_accesses( const struct landlock_ruleset *const domain, const struct dentry *const mnt_root, struct dentry *dir, layer_mask_t (*const layer_masks_dom)[LANDLOCK_NUM_ACCESS_FS]) { unsigned long access_dom; bool ret = false; if (WARN_ON_ONCE(!domain || !mnt_root || !dir || !layer_masks_dom)) return true; if (is_nouser_or_private(dir)) return true; access_dom = landlock_init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS, layer_masks_dom, LANDLOCK_KEY_INODE); dget(dir); while (true) { struct dentry *parent_dentry; /* Gets all layers allowing all domain accesses. */ if (landlock_unmask_layers(find_rule(domain, dir), access_dom, layer_masks_dom, ARRAY_SIZE(*layer_masks_dom))) { /* * Stops when all handled accesses are allowed by at * least one rule in each layer. */ ret = true; break; } /* We should not reach a root other than @mnt_root. */ if (dir == mnt_root || WARN_ON_ONCE(IS_ROOT(dir))) break; parent_dentry = dget_parent(dir); dput(dir); dir = parent_dentry; } dput(dir); return ret; } /** * current_check_refer_path - Check if a rename or link action is allowed * * @old_dentry: File or directory requested to be moved or linked. * @new_dir: Destination parent directory. * @new_dentry: Destination file or directory. * @removable: Sets to true if it is a rename operation. * @exchange: Sets to true if it is a rename operation with RENAME_EXCHANGE. * * Because of its unprivileged constraints, Landlock relies on file hierarchies * (and not only inodes) to tie access rights to files. Being able to link or * rename a file hierarchy brings some challenges. Indeed, moving or linking a * file (i.e. creating a new reference to an inode) can have an impact on the * actions allowed for a set of files if it would change its parent directory * (i.e. reparenting). * * To avoid trivial access right bypasses, Landlock first checks if the file or * directory requested to be moved would gain new access rights inherited from * its new hierarchy. Before returning any error, Landlock then checks that * the parent source hierarchy and the destination hierarchy would allow the * link or rename action. If it is not the case, an error with EACCES is * returned to inform user space that there is no way to remove or create the * requested source file type. If it should be allowed but the new inherited * access rights would be greater than the source access rights, then the * kernel returns an error with EXDEV. Prioritizing EACCES over EXDEV enables * user space to abort the whole operation if there is no way to do it, or to * manually copy the source to the destination if this remains allowed, e.g. * because file creation is allowed on the destination directory but not direct * linking. * * To achieve this goal, the kernel needs to compare two file hierarchies: the * one identifying the source file or directory (including itself), and the * destination one. This can be seen as a multilayer partial ordering problem. * The kernel walks through these paths and collects in a matrix the access * rights that are denied per layer. These matrices are then compared to see * if the destination one has more (or the same) restrictions as the source * one. If this is the case, the requested action will not return EXDEV, which * doesn't mean the action is allowed. The parent hierarchy of the source * (i.e. parent directory), and the destination hierarchy must also be checked * to verify that they explicitly allow such action (i.e. referencing, * creation and potentially removal rights). The kernel implementation is then * required to rely on potentially four matrices of access rights: one for the * source file or directory (i.e. the child), a potentially other one for the * other source/destination (in case of RENAME_EXCHANGE), one for the source * parent hierarchy and a last one for the destination hierarchy. These * ephemeral matrices take some space on the stack, which limits the number of * layers to a deemed reasonable number: 16. * * Returns: * - 0 if access is allowed; * - -EXDEV if @old_dentry would inherit new access rights from @new_dir; * - -EACCES if file removal or creation is denied. */ static int current_check_refer_path(struct dentry *const old_dentry, const struct path *const new_dir, struct dentry *const new_dentry, const bool removable, const bool exchange) { const struct landlock_ruleset *const dom = get_current_fs_domain(); bool allow_parent1, allow_parent2; access_mask_t access_request_parent1, access_request_parent2; struct path mnt_dir; struct dentry *old_parent; layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {}, layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {}; if (!dom) return 0; if (WARN_ON_ONCE(dom->num_layers < 1)) return -EACCES; if (unlikely(d_is_negative(old_dentry))) return -ENOENT; if (exchange) { if (unlikely(d_is_negative(new_dentry))) return -ENOENT; access_request_parent1 = get_mode_access(d_backing_inode(new_dentry)->i_mode); } else { access_request_parent1 = 0; } access_request_parent2 = get_mode_access(d_backing_inode(old_dentry)->i_mode); if (removable) { access_request_parent1 |= maybe_remove(old_dentry); access_request_parent2 |= maybe_remove(new_dentry); } /* The mount points are the same for old and new paths, cf. EXDEV. */ if (old_dentry->d_parent == new_dir->dentry) { /* * The LANDLOCK_ACCESS_FS_REFER access right is not required * for same-directory referer (i.e. no reparenting). */ access_request_parent1 = landlock_init_layer_masks( dom, access_request_parent1 | access_request_parent2, &layer_masks_parent1, LANDLOCK_KEY_INODE); if (is_access_to_paths_allowed( dom, new_dir, access_request_parent1, &layer_masks_parent1, NULL, 0, NULL, NULL)) return 0; return -EACCES; } access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER; access_request_parent2 |= LANDLOCK_ACCESS_FS_REFER; /* Saves the common mount point. */ mnt_dir.mnt = new_dir->mnt; mnt_dir.dentry = new_dir->mnt->mnt_root; /* * old_dentry may be the root of the common mount point and * !IS_ROOT(old_dentry) at the same time (e.g. with open_tree() and * OPEN_TREE_CLONE). We do not need to call dget(old_parent) because * we keep a reference to old_dentry. */ old_parent = (old_dentry == mnt_dir.dentry) ? old_dentry : old_dentry->d_parent; /* new_dir->dentry is equal to new_dentry->d_parent */ allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry, old_parent, &layer_masks_parent1); allow_parent2 = collect_domain_accesses( dom, mnt_dir.dentry, new_dir->dentry, &layer_masks_parent2); if (allow_parent1 && allow_parent2) return 0; /* * To be able to compare source and destination domain access rights, * take into account the @old_dentry access rights aggregated with its * parent access rights. This will be useful to compare with the * destination parent access rights. */ if (is_access_to_paths_allowed( dom, &mnt_dir, access_request_parent1, &layer_masks_parent1, old_dentry, access_request_parent2, &layer_masks_parent2, exchange ? new_dentry : NULL)) return 0; /* * This prioritizes EACCES over EXDEV for all actions, including * renames with RENAME_EXCHANGE. */ if (likely(is_eacces(&layer_masks_parent1, access_request_parent1) || is_eacces(&layer_masks_parent2, access_request_parent2))) return -EACCES; /* * Gracefully forbids reparenting if the destination directory * hierarchy is not a superset of restrictions of the source directory * hierarchy, or if LANDLOCK_ACCESS_FS_REFER is not allowed by the * source or the destination. */ return -EXDEV; } /* Inode hooks */ static void hook_inode_free_security(struct inode *const inode) { /* * All inodes must already have been untied from their object by * release_inode() or hook_sb_delete(). */ WARN_ON_ONCE(landlock_inode(inode)->object); } /* Super-block hooks */ /* * Release the inodes used in a security policy. * * Cf. fsnotify_unmount_inodes() and invalidate_inodes() */ static void hook_sb_delete(struct super_block *const sb) { struct inode *inode, *prev_inode = NULL; if (!landlock_initialized) return; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct landlock_object *object; /* Only handles referenced inodes. */ if (!atomic_read(&inode->i_count)) continue; /* * Protects against concurrent modification of inode (e.g. * from get_inode_object()). */ spin_lock(&inode->i_lock); /* * Checks I_FREEING and I_WILL_FREE to protect against a race * condition when release_inode() just called iput(), which * could lead to a NULL dereference of inode->security or a * second call to iput() for the same Landlock object. Also * checks I_NEW because such inode cannot be tied to an object. */ if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } rcu_read_lock(); object = rcu_dereference(landlock_inode(inode)->object); if (!object) { rcu_read_unlock(); spin_unlock(&inode->i_lock); continue; } /* Keeps a reference to this inode until the next loop walk. */ __iget(inode); spin_unlock(&inode->i_lock); /* * If there is no concurrent release_inode() ongoing, then we * are in charge of calling iput() on this inode, otherwise we * will just wait for it to finish. */ spin_lock(&object->lock); if (object->underobj == inode) { object->underobj = NULL; spin_unlock(&object->lock); rcu_read_unlock(); /* * Because object->underobj was not NULL, * release_inode() and get_inode_object() guarantee * that it is safe to reset * landlock_inode(inode)->object while it is not NULL. * It is therefore not necessary to lock inode->i_lock. */ rcu_assign_pointer(landlock_inode(inode)->object, NULL); /* * At this point, we own the ihold() reference that was * originally set up by get_inode_object() and the * __iget() reference that we just set in this loop * walk. Therefore the following call to iput() will * not sleep nor drop the inode because there is now at * least two references to it. */ iput(inode); } else { spin_unlock(&object->lock); rcu_read_unlock(); } if (prev_inode) { /* * At this point, we still own the __iget() reference * that we just set in this loop walk. Therefore we * can drop the list lock and know that the inode won't * disappear from under us until the next loop walk. */ spin_unlock(&sb->s_inode_list_lock); /* * We can now actually put the inode reference from the * previous loop walk, which is not needed anymore. */ iput(prev_inode); cond_resched(); spin_lock(&sb->s_inode_list_lock); } prev_inode = inode; } spin_unlock(&sb->s_inode_list_lock); /* Puts the inode reference from the last loop walk, if any. */ if (prev_inode) iput(prev_inode); /* Waits for pending iput() in release_inode(). */ wait_var_event(&landlock_superblock(sb)->inode_refs, !atomic_long_read(&landlock_superblock(sb)->inode_refs)); } /* * Because a Landlock security policy is defined according to the filesystem * topology (i.e. the mount namespace), changing it may grant access to files * not previously allowed. * * To make it simple, deny any filesystem topology modification by landlocked * processes. Non-landlocked processes may still change the namespace of a * landlocked process, but this kind of threat must be handled by a system-wide * access-control security policy. * * This could be lifted in the future if Landlock can safely handle mount * namespace updates requested by a landlocked process. Indeed, we could * update the current domain (which is currently read-only) by taking into * account the accesses of the source and the destination of a new mount point. * However, it would also require to make all the child domains dynamically * inherit these new constraints. Anyway, for backward compatibility reasons, * a dedicated user space option would be required (e.g. as a ruleset flag). */ static int hook_sb_mount(const char *const dev_name, const struct path *const path, const char *const type, const unsigned long flags, void *const data) { if (!get_current_fs_domain()) return 0; return -EPERM; } static int hook_move_mount(const struct path *const from_path, const struct path *const to_path) { if (!get_current_fs_domain()) return 0; return -EPERM; } /* * Removing a mount point may reveal a previously hidden file hierarchy, which * may then grant access to files, which may have previously been forbidden. */ static int hook_sb_umount(struct vfsmount *const mnt, const int flags) { if (!get_current_fs_domain()) return 0; return -EPERM; } static int hook_sb_remount(struct super_block *const sb, void *const mnt_opts) { if (!get_current_fs_domain()) return 0; return -EPERM; } /* * pivot_root(2), like mount(2), changes the current mount namespace. It must * then be forbidden for a landlocked process. * * However, chroot(2) may be allowed because it only changes the relative root * directory of the current process. Moreover, it can be used to restrict the * view of the filesystem. */ static int hook_sb_pivotroot(const struct path *const old_path, const struct path *const new_path) { if (!get_current_fs_domain()) return 0; return -EPERM; } /* Path hooks */ static int hook_path_link(struct dentry *const old_dentry, const struct path *const new_dir, struct dentry *const new_dentry) { return current_check_refer_path(old_dentry, new_dir, new_dentry, false, false); } static int hook_path_rename(const struct path *const old_dir, struct dentry *const old_dentry, const struct path *const new_dir, struct dentry *const new_dentry, const unsigned int flags) { /* old_dir refers to old_dentry->d_parent and new_dir->mnt */ return current_check_refer_path(old_dentry, new_dir, new_dentry, true, !!(flags & RENAME_EXCHANGE)); } static int hook_path_mkdir(const struct path *const dir, struct dentry *const dentry, const umode_t mode) { return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_DIR); } static int hook_path_mknod(const struct path *const dir, struct dentry *const dentry, const umode_t mode, const unsigned int dev) { const struct landlock_ruleset *const dom = get_current_fs_domain(); if (!dom) return 0; return check_access_path(dom, dir, get_mode_access(mode)); } static int hook_path_symlink(const struct path *const dir, struct dentry *const dentry, const char *const old_name) { return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_SYM); } static int hook_path_unlink(const struct path *const dir, struct dentry *const dentry) { return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_FILE); } static int hook_path_rmdir(const struct path *const dir, struct dentry *const dentry) { return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_DIR); } static int hook_path_truncate(const struct path *const path) { return current_check_access_path(path, LANDLOCK_ACCESS_FS_TRUNCATE); } /* File hooks */ /** * get_required_file_open_access - Get access needed to open a file * * @file: File being opened. * * Returns the access rights that are required for opening the given file, * depending on the file type and open mode. */ static access_mask_t get_required_file_open_access(const struct file *const file) { access_mask_t access = 0; if (file->f_mode & FMODE_READ) { /* A directory can only be opened in read mode. */ if (S_ISDIR(file_inode(file)->i_mode)) return LANDLOCK_ACCESS_FS_READ_DIR; access = LANDLOCK_ACCESS_FS_READ_FILE; } if (file->f_mode & FMODE_WRITE) access |= LANDLOCK_ACCESS_FS_WRITE_FILE; /* __FMODE_EXEC is indeed part of f_flags, not f_mode. */ if (file->f_flags & __FMODE_EXEC) access |= LANDLOCK_ACCESS_FS_EXECUTE; return access; } static int hook_file_alloc_security(struct file *const file) { /* * Grants all access rights, even if most of them are not checked later * on. It is more consistent. * * Notably, file descriptors for regular files can also be acquired * without going through the file_open hook, for example when using * memfd_create(2). */ landlock_file(file)->allowed_access = LANDLOCK_MASK_ACCESS_FS; return 0; } static bool is_device(const struct file *const file) { const struct inode *inode = file_inode(file); return S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode); } static int hook_file_open(struct file *const file) { layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; access_mask_t open_access_request, full_access_request, allowed_access, optional_access; const struct landlock_ruleset *const dom = get_fs_domain(landlock_cred(file->f_cred)->domain); if (!dom) return 0; /* * Because a file may be opened with O_PATH, get_required_file_open_access() * may return 0. This case will be handled with a future Landlock * evolution. */ open_access_request = get_required_file_open_access(file); /* * We look up more access than what we immediately need for open(), so * that we can later authorize operations on opened files. */ optional_access = LANDLOCK_ACCESS_FS_TRUNCATE; if (is_device(file)) optional_access |= LANDLOCK_ACCESS_FS_IOCTL_DEV; full_access_request = open_access_request | optional_access; if (is_access_to_paths_allowed( dom, &file->f_path, landlock_init_layer_masks(dom, full_access_request, &layer_masks, LANDLOCK_KEY_INODE), &layer_masks, NULL, 0, NULL, NULL)) { allowed_access = full_access_request; } else { unsigned long access_bit; const unsigned long access_req = full_access_request; /* * Calculate the actual allowed access rights from layer_masks. * Add each access right to allowed_access which has not been * vetoed by any layer. */ allowed_access = 0; for_each_set_bit(access_bit, &access_req, ARRAY_SIZE(layer_masks)) { if (!layer_masks[access_bit]) allowed_access |= BIT_ULL(access_bit); } } /* * For operations on already opened files (i.e. ftruncate()), it is the * access rights at the time of open() which decide whether the * operation is permitted. Therefore, we record the relevant subset of * file access rights in the opened struct file. */ landlock_file(file)->allowed_access = allowed_access; if ((open_access_request & allowed_access) == open_access_request) return 0; return -EACCES; } static int hook_file_truncate(struct file *const file) { /* * Allows truncation if the truncate right was available at the time of * opening the file, to get a consistent access check as for read, write * and execute operations. * * Note: For checks done based on the file's Landlock allowed access, we * enforce them independently of whether the current thread is in a * Landlock domain, so that open files passed between independent * processes retain their behaviour. */ if (landlock_file(file)->allowed_access & LANDLOCK_ACCESS_FS_TRUNCATE) return 0; return -EACCES; } static int hook_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { access_mask_t allowed_access = landlock_file(file)->allowed_access; /* * It is the access rights at the time of opening the file which * determine whether IOCTL can be used on the opened file later. * * The access right is attached to the opened file in hook_file_open(). */ if (allowed_access & LANDLOCK_ACCESS_FS_IOCTL_DEV) return 0; if (!is_device(file)) return 0; if (is_masked_device_ioctl(cmd)) return 0; return -EACCES; } static int hook_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { access_mask_t allowed_access = landlock_file(file)->allowed_access; /* * It is the access rights at the time of opening the file which * determine whether IOCTL can be used on the opened file later. * * The access right is attached to the opened file in hook_file_open(). */ if (allowed_access & LANDLOCK_ACCESS_FS_IOCTL_DEV) return 0; if (!is_device(file)) return 0; if (is_masked_device_ioctl_compat(cmd)) return 0; return -EACCES; } static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_free_security, hook_inode_free_security), LSM_HOOK_INIT(sb_delete, hook_sb_delete), LSM_HOOK_INIT(sb_mount, hook_sb_mount), LSM_HOOK_INIT(move_mount, hook_move_mount), LSM_HOOK_INIT(sb_umount, hook_sb_umount), LSM_HOOK_INIT(sb_remount, hook_sb_remount), LSM_HOOK_INIT(sb_pivotroot, hook_sb_pivotroot), LSM_HOOK_INIT(path_link, hook_path_link), LSM_HOOK_INIT(path_rename, hook_path_rename), LSM_HOOK_INIT(path_mkdir, hook_path_mkdir), LSM_HOOK_INIT(path_mknod, hook_path_mknod), LSM_HOOK_INIT(path_symlink, hook_path_symlink), LSM_HOOK_INIT(path_unlink, hook_path_unlink), LSM_HOOK_INIT(path_rmdir, hook_path_rmdir), LSM_HOOK_INIT(path_truncate, hook_path_truncate), LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security), LSM_HOOK_INIT(file_open, hook_file_open), LSM_HOOK_INIT(file_truncate, hook_file_truncate), LSM_HOOK_INIT(file_ioctl, hook_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, hook_file_ioctl_compat), }; __init void landlock_add_fs_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); } #ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST /* clang-format off */ static struct kunit_case test_cases[] = { KUNIT_CASE(test_no_more_access), KUNIT_CASE(test_scope_to_request_with_exec_none), KUNIT_CASE(test_scope_to_request_with_exec_some), KUNIT_CASE(test_scope_to_request_without_access), KUNIT_CASE(test_is_eacces_with_none), KUNIT_CASE(test_is_eacces_with_refer), KUNIT_CASE(test_is_eacces_with_write), {} }; /* clang-format on */ static struct kunit_suite test_suite = { .name = "landlock_fs", .test_cases = test_cases, }; kunit_test_suite(test_suite); #endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */ |
3 3 3 3 3 2 1 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 | // SPDX-License-Identifier: GPL-2.0-or-later /* * DVB USB Linux driver for Afatech AF9015 DVB-T USB2.0 receiver * * Copyright (C) 2007 Antti Palosaari <crope@iki.fi> * * Thanks to Afatech who kindly provided information. */ #include "af9015.h" static int dvb_usb_af9015_remote; module_param_named(remote, dvb_usb_af9015_remote, int, 0644); MODULE_PARM_DESC(remote, "select remote"); DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); static int af9015_ctrl_msg(struct dvb_usb_device *d, struct req_t *req) { #define REQ_HDR_LEN 8 /* send header size */ #define ACK_HDR_LEN 2 /* rece header size */ struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret, wlen, rlen; u8 write = 1; mutex_lock(&d->usb_mutex); state->buf[0] = req->cmd; state->buf[1] = state->seq++; state->buf[2] = req->i2c_addr << 1; state->buf[3] = req->addr >> 8; state->buf[4] = req->addr & 0xff; state->buf[5] = req->mbox; state->buf[6] = req->addr_len; state->buf[7] = req->data_len; switch (req->cmd) { case GET_CONFIG: case READ_MEMORY: case RECONNECT_USB: write = 0; break; case READ_I2C: write = 0; state->buf[2] |= 0x01; /* set I2C direction */ fallthrough; case WRITE_I2C: state->buf[0] = READ_WRITE_I2C; break; case WRITE_MEMORY: if (((req->addr & 0xff00) == 0xff00) || ((req->addr & 0xff00) == 0xae00)) state->buf[0] = WRITE_VIRTUAL_MEMORY; break; case WRITE_VIRTUAL_MEMORY: case COPY_FIRMWARE: case DOWNLOAD_FIRMWARE: case BOOT: break; default: dev_err(&intf->dev, "unknown cmd %d\n", req->cmd); ret = -EIO; goto error; } /* Buffer overflow check */ if ((write && (req->data_len > BUF_LEN - REQ_HDR_LEN)) || (!write && (req->data_len > BUF_LEN - ACK_HDR_LEN))) { dev_err(&intf->dev, "too much data, cmd %u, len %u\n", req->cmd, req->data_len); ret = -EINVAL; goto error; } /* * Write receives seq + status = 2 bytes * Read receives seq + status + data = 2 + N bytes */ wlen = REQ_HDR_LEN; rlen = ACK_HDR_LEN; if (write) { wlen += req->data_len; memcpy(&state->buf[REQ_HDR_LEN], req->data, req->data_len); } else { rlen += req->data_len; } /* no ack for these packets */ if (req->cmd == DOWNLOAD_FIRMWARE || req->cmd == RECONNECT_USB) rlen = 0; ret = dvb_usbv2_generic_rw_locked(d, state->buf, wlen, state->buf, rlen); if (ret) goto error; /* check status */ if (rlen && state->buf[1]) { dev_err(&intf->dev, "cmd failed %u\n", state->buf[1]); ret = -EIO; goto error; } /* read request, copy returned data to return buf */ if (!write) memcpy(req->data, &state->buf[ACK_HDR_LEN], req->data_len); error: mutex_unlock(&d->usb_mutex); return ret; } static int af9015_write_reg_i2c(struct dvb_usb_device *d, u8 addr, u16 reg, u8 val) { struct af9015_state *state = d_to_priv(d); struct req_t req = {WRITE_I2C, addr, reg, 1, 1, 1, &val}; if (addr == state->af9013_i2c_addr[0] || addr == state->af9013_i2c_addr[1]) req.addr_len = 3; return af9015_ctrl_msg(d, &req); } static int af9015_read_reg_i2c(struct dvb_usb_device *d, u8 addr, u16 reg, u8 *val) { struct af9015_state *state = d_to_priv(d); struct req_t req = {READ_I2C, addr, reg, 0, 1, 1, val}; if (addr == state->af9013_i2c_addr[0] || addr == state->af9013_i2c_addr[1]) req.addr_len = 3; return af9015_ctrl_msg(d, &req); } static int af9015_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; u16 addr; u8 mbox, addr_len; struct req_t req; /* * I2C multiplexing: * There could be two tuners, both using same I2C address. Demodulator * I2C-gate is only possibility to select correct tuner. * * ........................................... * . AF9015 integrates AF9013 demodulator . * . ____________ ____________ . ____________ * .| USB IF | | demod |. | tuner | * .|------------| |------------|. |------------| * .| AF9015 | | AF9013 |. | MXL5003 | * .| |--+--I2C-----|-----/ -----|.----I2C-----| | * .| | | | addr 0x1c |. | addr 0x63 | * .|____________| | |____________|. |____________| * .................|......................... * | ____________ ____________ * | | demod | | tuner | * | |------------| |------------| * | | AF9013 | | MXL5003 | * +--I2C-----|-----/ -----|-----I2C-----| | * | addr 0x1d | | addr 0x63 | * |____________| |____________| */ if (msg[0].len == 0 || msg[0].flags & I2C_M_RD) { addr = 0x0000; mbox = 0; addr_len = 0; } else if (msg[0].len == 1) { addr = msg[0].buf[0]; mbox = 0; addr_len = 1; } else if (msg[0].len == 2) { addr = msg[0].buf[0] << 8 | msg[0].buf[1] << 0; mbox = 0; addr_len = 2; } else { addr = msg[0].buf[0] << 8 | msg[0].buf[1] << 0; mbox = msg[0].buf[2]; addr_len = 3; } if (num == 1 && !(msg[0].flags & I2C_M_RD)) { /* i2c write */ if (msg[0].len > 21) { ret = -EOPNOTSUPP; goto err; } if (msg[0].addr == state->af9013_i2c_addr[0]) req.cmd = WRITE_MEMORY; else req.cmd = WRITE_I2C; req.i2c_addr = msg[0].addr; req.addr = addr; req.mbox = mbox; req.addr_len = addr_len; req.data_len = msg[0].len - addr_len; req.data = &msg[0].buf[addr_len]; ret = af9015_ctrl_msg(d, &req); } else if (num == 2 && !(msg[0].flags & I2C_M_RD) && (msg[1].flags & I2C_M_RD)) { /* i2c write + read */ if (msg[0].len > 3 || msg[1].len > 61) { ret = -EOPNOTSUPP; goto err; } if (msg[0].addr == state->af9013_i2c_addr[0]) req.cmd = READ_MEMORY; else req.cmd = READ_I2C; req.i2c_addr = msg[0].addr; req.addr = addr; req.mbox = mbox; req.addr_len = addr_len; req.data_len = msg[1].len; req.data = &msg[1].buf[0]; ret = af9015_ctrl_msg(d, &req); } else if (num == 1 && (msg[0].flags & I2C_M_RD)) { /* i2c read */ if (msg[0].len > 61) { ret = -EOPNOTSUPP; goto err; } if (msg[0].addr == state->af9013_i2c_addr[0]) { ret = -EINVAL; goto err; } req.cmd = READ_I2C; req.i2c_addr = msg[0].addr; req.addr = addr; req.mbox = mbox; req.addr_len = addr_len; req.data_len = msg[0].len; req.data = &msg[0].buf[0]; ret = af9015_ctrl_msg(d, &req); } else { ret = -EOPNOTSUPP; dev_dbg(&intf->dev, "unknown msg, num %u\n", num); } if (ret) goto err; return num; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static u32 af9015_i2c_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C; } static struct i2c_algorithm af9015_i2c_algo = { .master_xfer = af9015_i2c_xfer, .functionality = af9015_i2c_func, }; static int af9015_identify_state(struct dvb_usb_device *d, const char **name) { struct usb_interface *intf = d->intf; int ret; u8 reply; struct req_t req = {GET_CONFIG, 0, 0, 0, 0, 1, &reply}; ret = af9015_ctrl_msg(d, &req); if (ret) return ret; dev_dbg(&intf->dev, "reply %02x\n", reply); if (reply == 0x02) ret = WARM; else ret = COLD; return ret; } static int af9015_download_firmware(struct dvb_usb_device *d, const struct firmware *firmware) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret, i, rem; struct req_t req = {DOWNLOAD_FIRMWARE, 0, 0, 0, 0, 0, NULL}; u16 checksum; dev_dbg(&intf->dev, "\n"); /* Calc checksum, we need it when copy firmware to slave demod */ for (i = 0, checksum = 0; i < firmware->size; i++) checksum += firmware->data[i]; state->firmware_size = firmware->size; state->firmware_checksum = checksum; #define LEN_MAX (BUF_LEN - REQ_HDR_LEN) /* Max payload size */ for (rem = firmware->size; rem > 0; rem -= LEN_MAX) { req.data_len = min(LEN_MAX, rem); req.data = (u8 *)&firmware->data[firmware->size - rem]; req.addr = 0x5100 + firmware->size - rem; ret = af9015_ctrl_msg(d, &req); if (ret) { dev_err(&intf->dev, "firmware download failed %d\n", ret); goto err; } } req.cmd = BOOT; req.data_len = 0; ret = af9015_ctrl_msg(d, &req); if (ret) { dev_err(&intf->dev, "firmware boot failed %d\n", ret); goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } #define AF9015_EEPROM_SIZE 256 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ #define GOLDEN_RATIO_PRIME_32 0x9e370001UL /* hash (and dump) eeprom */ static int af9015_eeprom_hash(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret, i; u8 buf[AF9015_EEPROM_SIZE]; struct req_t req = {READ_I2C, AF9015_I2C_EEPROM, 0, 0, 1, 1, NULL}; /* read eeprom */ for (i = 0; i < AF9015_EEPROM_SIZE; i++) { req.addr = i; req.data = &buf[i]; ret = af9015_ctrl_msg(d, &req); if (ret < 0) goto err; } /* calculate checksum */ for (i = 0; i < AF9015_EEPROM_SIZE / sizeof(u32); i++) { state->eeprom_sum *= GOLDEN_RATIO_PRIME_32; state->eeprom_sum += le32_to_cpu(((__le32 *)buf)[i]); } for (i = 0; i < AF9015_EEPROM_SIZE; i += 16) dev_dbg(&intf->dev, "%*ph\n", 16, buf + i); dev_dbg(&intf->dev, "eeprom sum %.8x\n", state->eeprom_sum); return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_read_config(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; u8 val, i, offset = 0; struct req_t req = {READ_I2C, AF9015_I2C_EEPROM, 0, 0, 1, 1, &val}; dev_dbg(&intf->dev, "\n"); /* IR remote controller */ req.addr = AF9015_EEPROM_IR_MODE; /* first message will timeout often due to possible hw bug */ for (i = 0; i < 4; i++) { ret = af9015_ctrl_msg(d, &req); if (!ret) break; } if (ret) goto error; ret = af9015_eeprom_hash(d); if (ret) goto error; state->ir_mode = val; dev_dbg(&intf->dev, "ir mode %02x\n", val); /* TS mode - one or two receivers */ req.addr = AF9015_EEPROM_TS_MODE; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->dual_mode = val; dev_dbg(&intf->dev, "ts mode %02x\n", state->dual_mode); state->af9013_i2c_addr[0] = AF9015_I2C_DEMOD; if (state->dual_mode) { /* read 2nd demodulator I2C address */ req.addr = AF9015_EEPROM_DEMOD2_I2C; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->af9013_i2c_addr[1] = val >> 1; } for (i = 0; i < state->dual_mode + 1; i++) { if (i == 1) offset = AF9015_EEPROM_OFFSET; /* xtal */ req.addr = AF9015_EEPROM_XTAL_TYPE1 + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; switch (val) { case 0: state->af9013_pdata[i].clk = 28800000; break; case 1: state->af9013_pdata[i].clk = 20480000; break; case 2: state->af9013_pdata[i].clk = 28000000; break; case 3: state->af9013_pdata[i].clk = 25000000; break; } dev_dbg(&intf->dev, "[%d] xtal %02x, clk %u\n", i, val, state->af9013_pdata[i].clk); /* IF frequency */ req.addr = AF9015_EEPROM_IF1H + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->af9013_pdata[i].if_frequency = val << 8; req.addr = AF9015_EEPROM_IF1L + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->af9013_pdata[i].if_frequency += val; state->af9013_pdata[i].if_frequency *= 1000; dev_dbg(&intf->dev, "[%d] if frequency %u\n", i, state->af9013_pdata[i].if_frequency); /* MT2060 IF1 */ req.addr = AF9015_EEPROM_MT2060_IF1H + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->mt2060_if1[i] = val << 8; req.addr = AF9015_EEPROM_MT2060_IF1L + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; state->mt2060_if1[i] += val; dev_dbg(&intf->dev, "[%d] MT2060 IF1 %u\n", i, state->mt2060_if1[i]); /* tuner */ req.addr = AF9015_EEPROM_TUNER_ID1 + offset; ret = af9015_ctrl_msg(d, &req); if (ret) goto error; switch (val) { case AF9013_TUNER_ENV77H11D5: case AF9013_TUNER_MT2060: case AF9013_TUNER_QT1010: case AF9013_TUNER_UNKNOWN: case AF9013_TUNER_MT2060_2: case AF9013_TUNER_TDA18271: case AF9013_TUNER_QT1010A: case AF9013_TUNER_TDA18218: state->af9013_pdata[i].spec_inv = 1; break; case AF9013_TUNER_MXL5003D: case AF9013_TUNER_MXL5005D: case AF9013_TUNER_MXL5005R: case AF9013_TUNER_MXL5007T: state->af9013_pdata[i].spec_inv = 0; break; case AF9013_TUNER_MC44S803: state->af9013_pdata[i].gpio[1] = AF9013_GPIO_LO; state->af9013_pdata[i].spec_inv = 1; break; default: dev_err(&intf->dev, "tuner id %02x not supported, please report!\n", val); return -ENODEV; } state->af9013_pdata[i].tuner = val; dev_dbg(&intf->dev, "[%d] tuner id %02x\n", i, val); } error: if (ret) dev_err(&intf->dev, "eeprom read failed %d\n", ret); /* * AverMedia AVerTV Volar Black HD (A850) device have bad EEPROM * content :-( Override some wrong values here. Ditto for the * AVerTV Red HD+ (A850T) device. */ if (le16_to_cpu(d->udev->descriptor.idVendor) == USB_VID_AVERMEDIA && ((le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_AVERMEDIA_A850) || (le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_AVERMEDIA_A850T))) { dev_dbg(&intf->dev, "AverMedia A850: overriding config\n"); /* disable dual mode */ state->dual_mode = 0; /* set correct IF */ state->af9013_pdata[0].if_frequency = 4570000; } return ret; } static int af9015_get_stream_config(struct dvb_frontend *fe, u8 *ts_type, struct usb_data_stream_properties *stream) { struct dvb_usb_device *d = fe_to_d(fe); struct usb_interface *intf = d->intf; dev_dbg(&intf->dev, "adap %u\n", fe_to_adap(fe)->id); if (d->udev->speed == USB_SPEED_FULL) stream->u.bulk.buffersize = 5 * 188; return 0; } static int af9015_streaming_ctrl(struct dvb_frontend *fe, int onoff) { struct dvb_usb_device *d = fe_to_d(fe); struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; unsigned int utmp1, utmp2, reg1, reg2; u8 buf[2]; const unsigned int adap_id = fe_to_adap(fe)->id; dev_dbg(&intf->dev, "adap id %d, onoff %d\n", adap_id, onoff); if (!state->usb_ts_if_configured[adap_id]) { dev_dbg(&intf->dev, "set usb and ts interface\n"); /* USB IF stream settings */ utmp1 = (d->udev->speed == USB_SPEED_FULL ? 5 : 87) * 188 / 4; utmp2 = (d->udev->speed == USB_SPEED_FULL ? 64 : 512) / 4; buf[0] = (utmp1 >> 0) & 0xff; buf[1] = (utmp1 >> 8) & 0xff; if (adap_id == 0) { /* 1st USB IF (EP4) stream settings */ reg1 = 0xdd88; reg2 = 0xdd0c; } else { /* 2nd USB IF (EP5) stream settings */ reg1 = 0xdd8a; reg2 = 0xdd0d; } ret = regmap_bulk_write(state->regmap, reg1, buf, 2); if (ret) goto err; ret = regmap_write(state->regmap, reg2, utmp2); if (ret) goto err; /* TS IF settings */ if (state->dual_mode) { utmp1 = 0x01; utmp2 = 0x10; } else { utmp1 = 0x00; utmp2 = 0x00; } ret = regmap_update_bits(state->regmap, 0xd50b, 0x01, utmp1); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xd520, 0x10, utmp2); if (ret) goto err; state->usb_ts_if_configured[adap_id] = true; } if (adap_id == 0 && onoff) { /* Adapter 0 stream on. EP4: clear NAK, enable, clear reset */ ret = regmap_update_bits(state->regmap, 0xdd13, 0x20, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x20, 0x20); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xd507, 0x04, 0x00); if (ret) goto err; } else if (adap_id == 1 && onoff) { /* Adapter 1 stream on. EP5: clear NAK, enable, clear reset */ ret = regmap_update_bits(state->regmap, 0xdd13, 0x40, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x40, 0x40); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xd50b, 0x02, 0x00); if (ret) goto err; } else if (adap_id == 0 && !onoff) { /* Adapter 0 stream off. EP4: set reset, disable, set NAK */ ret = regmap_update_bits(state->regmap, 0xd507, 0x04, 0x04); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x20, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd13, 0x20, 0x20); if (ret) goto err; } else if (adap_id == 1 && !onoff) { /* Adapter 1 stream off. EP5: set reset, disable, set NAK */ ret = regmap_update_bits(state->regmap, 0xd50b, 0x02, 0x02); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd11, 0x40, 0x00); if (ret) goto err; ret = regmap_update_bits(state->regmap, 0xdd13, 0x40, 0x40); if (ret) goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_get_adapter_count(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); return state->dual_mode + 1; } /* override demod callbacks for resource locking */ static int af9015_af9013_set_frontend(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->set_frontend[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override demod callbacks for resource locking */ static int af9015_af9013_read_status(struct dvb_frontend *fe, enum fe_status *status) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->read_status[fe_to_adap(fe)->id](fe, status); mutex_unlock(&state->fe_mutex); return ret; } /* override demod callbacks for resource locking */ static int af9015_af9013_init(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->init[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override demod callbacks for resource locking */ static int af9015_af9013_sleep(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->sleep[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override tuner callbacks for resource locking */ static int af9015_tuner_init(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->tuner_init[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } /* override tuner callbacks for resource locking */ static int af9015_tuner_sleep(struct dvb_frontend *fe) { int ret; struct af9015_state *state = fe_to_priv(fe); if (mutex_lock_interruptible(&state->fe_mutex)) return -EAGAIN; ret = state->tuner_sleep[fe_to_adap(fe)->id](fe); mutex_unlock(&state->fe_mutex); return ret; } static int af9015_copy_firmware(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; unsigned long timeout; u8 val, firmware_info[4]; struct req_t req = {COPY_FIRMWARE, 0, 0x5100, 0, 0, 4, firmware_info}; dev_dbg(&intf->dev, "\n"); firmware_info[0] = (state->firmware_size >> 8) & 0xff; firmware_info[1] = (state->firmware_size >> 0) & 0xff; firmware_info[2] = (state->firmware_checksum >> 8) & 0xff; firmware_info[3] = (state->firmware_checksum >> 0) & 0xff; /* Check whether firmware is already running */ ret = af9015_read_reg_i2c(d, state->af9013_i2c_addr[1], 0x98be, &val); if (ret) goto err; dev_dbg(&intf->dev, "firmware status %02x\n", val); if (val == 0x0c) return 0; /* Set i2c clock to 625kHz to speed up firmware copy */ ret = regmap_write(state->regmap, 0xd416, 0x04); if (ret) goto err; /* Copy firmware from master demod to slave demod */ ret = af9015_ctrl_msg(d, &req); if (ret) { dev_err(&intf->dev, "firmware copy cmd failed %d\n", ret); goto err; } /* Set i2c clock to 125kHz */ ret = regmap_write(state->regmap, 0xd416, 0x14); if (ret) goto err; /* Boot firmware */ ret = af9015_write_reg_i2c(d, state->af9013_i2c_addr[1], 0xe205, 0x01); if (ret) goto err; /* Poll firmware ready */ for (val = 0x00, timeout = jiffies + msecs_to_jiffies(1000); !time_after(jiffies, timeout) && val != 0x0c && val != 0x04;) { msleep(20); /* Check firmware status. 0c=OK, 04=fail */ ret = af9015_read_reg_i2c(d, state->af9013_i2c_addr[1], 0x98be, &val); if (ret) goto err; dev_dbg(&intf->dev, "firmware status %02x\n", val); } dev_dbg(&intf->dev, "firmware boot took %u ms\n", jiffies_to_msecs(jiffies) - (jiffies_to_msecs(timeout) - 1000)); if (val == 0x04) { ret = -ENODEV; dev_err(&intf->dev, "firmware did not run\n"); goto err; } else if (val != 0x0c) { ret = -ETIMEDOUT; dev_err(&intf->dev, "firmware boot timeout\n"); goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_af9013_frontend_attach(struct dvb_usb_adapter *adap) { struct af9015_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct usb_interface *intf = d->intf; struct i2c_client *client; int ret; dev_dbg(&intf->dev, "adap id %u\n", adap->id); if (adap->id == 0) { state->af9013_pdata[0].ts_mode = AF9013_TS_MODE_USB; memcpy(state->af9013_pdata[0].api_version, "\x0\x1\x9\x0", 4); state->af9013_pdata[0].gpio[0] = AF9013_GPIO_HI; state->af9013_pdata[0].gpio[3] = AF9013_GPIO_TUNER_ON; } else if (adap->id == 1) { state->af9013_pdata[1].ts_mode = AF9013_TS_MODE_SERIAL; state->af9013_pdata[1].ts_output_pin = 7; memcpy(state->af9013_pdata[1].api_version, "\x0\x1\x9\x0", 4); state->af9013_pdata[1].gpio[0] = AF9013_GPIO_TUNER_ON; state->af9013_pdata[1].gpio[1] = AF9013_GPIO_LO; /* copy firmware to 2nd demodulator */ if (state->dual_mode) { /* Wait 2nd demodulator ready */ msleep(100); ret = af9015_copy_firmware(adap_to_d(adap)); if (ret) { dev_err(&intf->dev, "firmware copy to 2nd frontend failed, will disable it\n"); state->dual_mode = 0; goto err; } } else { ret = -ENODEV; goto err; } } /* Add I2C demod */ client = dvb_module_probe("af9013", NULL, &d->i2c_adap, state->af9013_i2c_addr[adap->id], &state->af9013_pdata[adap->id]); if (!client) { ret = -ENODEV; goto err; } adap->fe[0] = state->af9013_pdata[adap->id].get_dvb_frontend(client); state->demod_i2c_client[adap->id] = client; /* * AF9015 firmware does not like if it gets interrupted by I2C adapter * request on some critical phases. During normal operation I2C adapter * is used only 2nd demodulator and tuner on dual tuner devices. * Override demodulator callbacks and use mutex for limit access to * those "critical" paths to keep AF9015 happy. */ if (adap->fe[0]) { state->set_frontend[adap->id] = adap->fe[0]->ops.set_frontend; adap->fe[0]->ops.set_frontend = af9015_af9013_set_frontend; state->read_status[adap->id] = adap->fe[0]->ops.read_status; adap->fe[0]->ops.read_status = af9015_af9013_read_status; state->init[adap->id] = adap->fe[0]->ops.init; adap->fe[0]->ops.init = af9015_af9013_init; state->sleep[adap->id] = adap->fe[0]->ops.sleep; adap->fe[0]->ops.sleep = af9015_af9013_sleep; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_frontend_detach(struct dvb_usb_adapter *adap) { struct af9015_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct usb_interface *intf = d->intf; struct i2c_client *client; dev_dbg(&intf->dev, "adap id %u\n", adap->id); /* Remove I2C demod */ client = state->demod_i2c_client[adap->id]; dvb_module_release(client); return 0; } static struct mt2060_config af9015_mt2060_config = { .i2c_address = 0x60, .clock_out = 0, }; static struct qt1010_config af9015_qt1010_config = { .i2c_address = 0x62, }; static struct tda18271_config af9015_tda18271_config = { .gate = TDA18271_GATE_DIGITAL, .small_i2c = TDA18271_16_BYTE_CHUNK_INIT, }; static struct mxl5005s_config af9015_mxl5003_config = { .i2c_address = 0x63, .if_freq = IF_FREQ_4570000HZ, .xtal_freq = CRYSTAL_FREQ_16000000HZ, .agc_mode = MXL_SINGLE_AGC, .tracking_filter = MXL_TF_DEFAULT, .rssi_enable = MXL_RSSI_ENABLE, .cap_select = MXL_CAP_SEL_ENABLE, .div_out = MXL_DIV_OUT_4, .clock_out = MXL_CLOCK_OUT_DISABLE, .output_load = MXL5005S_IF_OUTPUT_LOAD_200_OHM, .top = MXL5005S_TOP_25P2, .mod_mode = MXL_DIGITAL_MODE, .if_mode = MXL_ZERO_IF, .AgcMasterByte = 0x00, }; static struct mxl5005s_config af9015_mxl5005_config = { .i2c_address = 0x63, .if_freq = IF_FREQ_4570000HZ, .xtal_freq = CRYSTAL_FREQ_16000000HZ, .agc_mode = MXL_SINGLE_AGC, .tracking_filter = MXL_TF_OFF, .rssi_enable = MXL_RSSI_ENABLE, .cap_select = MXL_CAP_SEL_ENABLE, .div_out = MXL_DIV_OUT_4, .clock_out = MXL_CLOCK_OUT_DISABLE, .output_load = MXL5005S_IF_OUTPUT_LOAD_200_OHM, .top = MXL5005S_TOP_25P2, .mod_mode = MXL_DIGITAL_MODE, .if_mode = MXL_ZERO_IF, .AgcMasterByte = 0x00, }; static struct mc44s803_config af9015_mc44s803_config = { .i2c_address = 0x60, .dig_out = 1, }; static struct tda18218_config af9015_tda18218_config = { .i2c_address = 0x60, .i2c_wr_max = 21, /* max wr bytes AF9015 I2C adap can handle at once */ }; static struct mxl5007t_config af9015_mxl5007t_config = { .xtal_freq_hz = MxL_XTAL_24_MHZ, .if_freq_hz = MxL_IF_4_57_MHZ, }; static int af9015_tuner_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap_to_d(adap); struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; struct i2c_client *client; struct i2c_adapter *adapter; int ret; dev_dbg(&intf->dev, "adap id %u\n", adap->id); client = state->demod_i2c_client[adap->id]; adapter = state->af9013_pdata[adap->id].get_i2c_adapter(client); switch (state->af9013_pdata[adap->id].tuner) { case AF9013_TUNER_MT2060: case AF9013_TUNER_MT2060_2: ret = dvb_attach(mt2060_attach, adap->fe[0], adapter, &af9015_mt2060_config, state->mt2060_if1[adap->id]) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_QT1010: case AF9013_TUNER_QT1010A: ret = dvb_attach(qt1010_attach, adap->fe[0], adapter, &af9015_qt1010_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_TDA18271: ret = dvb_attach(tda18271_attach, adap->fe[0], 0x60, adapter, &af9015_tda18271_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_TDA18218: ret = dvb_attach(tda18218_attach, adap->fe[0], adapter, &af9015_tda18218_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MXL5003D: ret = dvb_attach(mxl5005s_attach, adap->fe[0], adapter, &af9015_mxl5003_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MXL5005D: case AF9013_TUNER_MXL5005R: ret = dvb_attach(mxl5005s_attach, adap->fe[0], adapter, &af9015_mxl5005_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_ENV77H11D5: ret = dvb_attach(dvb_pll_attach, adap->fe[0], 0x60, adapter, DVB_PLL_TDA665X) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MC44S803: ret = dvb_attach(mc44s803_attach, adap->fe[0], adapter, &af9015_mc44s803_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_MXL5007T: ret = dvb_attach(mxl5007t_attach, adap->fe[0], adapter, 0x60, &af9015_mxl5007t_config) == NULL ? -ENODEV : 0; break; case AF9013_TUNER_UNKNOWN: default: dev_err(&intf->dev, "unknown tuner, tuner id %02x\n", state->af9013_pdata[adap->id].tuner); ret = -ENODEV; } if (adap->fe[0]->ops.tuner_ops.init) { state->tuner_init[adap->id] = adap->fe[0]->ops.tuner_ops.init; adap->fe[0]->ops.tuner_ops.init = af9015_tuner_init; } if (adap->fe[0]->ops.tuner_ops.sleep) { state->tuner_sleep[adap->id] = adap->fe[0]->ops.tuner_ops.sleep; adap->fe[0]->ops.tuner_ops.sleep = af9015_tuner_sleep; } return ret; } static int af9015_pid_filter_ctrl(struct dvb_usb_adapter *adap, int onoff) { struct af9015_state *state = adap_to_priv(adap); struct af9013_platform_data *pdata = &state->af9013_pdata[adap->id]; int ret; mutex_lock(&state->fe_mutex); ret = pdata->pid_filter_ctrl(adap->fe[0], onoff); mutex_unlock(&state->fe_mutex); return ret; } static int af9015_pid_filter(struct dvb_usb_adapter *adap, int index, u16 pid, int onoff) { struct af9015_state *state = adap_to_priv(adap); struct af9013_platform_data *pdata = &state->af9013_pdata[adap->id]; int ret; mutex_lock(&state->fe_mutex); ret = pdata->pid_filter(adap->fe[0], index, pid, onoff); mutex_unlock(&state->fe_mutex); return ret; } static int af9015_init(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; dev_dbg(&intf->dev, "\n"); mutex_init(&state->fe_mutex); /* init RC canary */ ret = regmap_write(state->regmap, 0x98e9, 0xff); if (ret) goto error; error: return ret; } #if IS_ENABLED(CONFIG_RC_CORE) struct af9015_rc_setup { unsigned int id; char *rc_codes; }; static char *af9015_rc_setup_match(unsigned int id, const struct af9015_rc_setup *table) { for (; table->rc_codes; table++) if (table->id == id) return table->rc_codes; return NULL; } static const struct af9015_rc_setup af9015_rc_setup_modparam[] = { { AF9015_REMOTE_A_LINK_DTU_M, RC_MAP_ALINK_DTU_M }, { AF9015_REMOTE_MSI_DIGIVOX_MINI_II_V3, RC_MAP_MSI_DIGIVOX_II }, { AF9015_REMOTE_MYGICTV_U718, RC_MAP_TOTAL_MEDIA_IN_HAND }, { AF9015_REMOTE_DIGITTRADE_DVB_T, RC_MAP_DIGITTRADE }, { AF9015_REMOTE_AVERMEDIA_KS, RC_MAP_AVERMEDIA_RM_KS }, { } }; static const struct af9015_rc_setup af9015_rc_setup_hashes[] = { { 0xb8feb708, RC_MAP_MSI_DIGIVOX_II }, { 0xa3703d00, RC_MAP_ALINK_DTU_M }, { 0x9b7dc64e, RC_MAP_TOTAL_MEDIA_IN_HAND }, /* MYGICTV U718 */ { 0x5d49e3db, RC_MAP_DIGITTRADE }, /* LC-Power LC-USB-DVBT */ { } }; static int af9015_rc_query(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; int ret; u8 buf[17]; /* read registers needed to detect remote controller code */ ret = regmap_bulk_read(state->regmap, 0x98d9, buf, sizeof(buf)); if (ret) goto error; /* If any of these are non-zero, assume invalid data */ if (buf[1] || buf[2] || buf[3]) { dev_dbg(&intf->dev, "invalid data\n"); return 0; } /* Check for repeat of previous code */ if ((state->rc_repeat != buf[6] || buf[0]) && !memcmp(&buf[12], state->rc_last, 4)) { dev_dbg(&intf->dev, "key repeated\n"); rc_repeat(d->rc_dev); state->rc_repeat = buf[6]; return 0; } /* Only process key if canary killed */ if (buf[16] != 0xff && buf[0] != 0x01) { enum rc_proto proto; dev_dbg(&intf->dev, "key pressed %*ph\n", 4, buf + 12); /* Reset the canary */ ret = regmap_write(state->regmap, 0x98e9, 0xff); if (ret) goto error; /* Remember this key */ memcpy(state->rc_last, &buf[12], 4); if (buf[14] == (u8)~buf[15]) { if (buf[12] == (u8)~buf[13]) { /* NEC */ state->rc_keycode = RC_SCANCODE_NEC(buf[12], buf[14]); proto = RC_PROTO_NEC; } else { /* NEC extended*/ state->rc_keycode = RC_SCANCODE_NECX(buf[12] << 8 | buf[13], buf[14]); proto = RC_PROTO_NECX; } } else { /* 32 bit NEC */ state->rc_keycode = RC_SCANCODE_NEC32(buf[12] << 24 | buf[13] << 16 | buf[14] << 8 | buf[15]); proto = RC_PROTO_NEC32; } rc_keydown(d->rc_dev, proto, state->rc_keycode, 0); } else { dev_dbg(&intf->dev, "no key press\n"); /* Invalidate last keypress */ /* Not really needed, but helps with debug */ state->rc_last[2] = state->rc_last[3]; } state->rc_repeat = buf[6]; state->rc_failed = false; error: if (ret) { dev_warn(&intf->dev, "rc query failed %d\n", ret); /* allow random errors as dvb-usb will stop polling on error */ if (!state->rc_failed) ret = 0; state->rc_failed = true; } return ret; } static int af9015_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { struct af9015_state *state = d_to_priv(d); u16 vid = le16_to_cpu(d->udev->descriptor.idVendor); if (state->ir_mode == AF9015_IR_MODE_DISABLED) return 0; /* try to load remote based module param */ if (!rc->map_name) rc->map_name = af9015_rc_setup_match(dvb_usb_af9015_remote, af9015_rc_setup_modparam); /* try to load remote based eeprom hash */ if (!rc->map_name) rc->map_name = af9015_rc_setup_match(state->eeprom_sum, af9015_rc_setup_hashes); /* try to load remote based USB iManufacturer string */ if (!rc->map_name && vid == USB_VID_AFATECH) { /* * Check USB manufacturer and product strings and try * to determine correct remote in case of chip vendor * reference IDs are used. * DO NOT ADD ANYTHING NEW HERE. Use hashes instead. */ char manufacturer[10]; memset(manufacturer, 0, sizeof(manufacturer)); usb_string(d->udev, d->udev->descriptor.iManufacturer, manufacturer, sizeof(manufacturer)); if (!strcmp("MSI", manufacturer)) { /* * iManufacturer 1 MSI * iProduct 2 MSI K-VOX */ rc->map_name = af9015_rc_setup_match(AF9015_REMOTE_MSI_DIGIVOX_MINI_II_V3, af9015_rc_setup_modparam); } } /* load empty to enable rc */ if (!rc->map_name) rc->map_name = RC_MAP_EMPTY; rc->allowed_protos = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32; rc->query = af9015_rc_query; rc->interval = 500; return 0; } #else #define af9015_get_rc_config NULL #endif static int af9015_regmap_write(void *context, const void *data, size_t count) { struct dvb_usb_device *d = context; struct usb_interface *intf = d->intf; int ret; u16 reg = ((u8 *)data)[0] << 8 | ((u8 *)data)[1] << 0; u8 *val = &((u8 *)data)[2]; const unsigned int len = count - 2; struct req_t req = {WRITE_MEMORY, 0, reg, 0, 0, len, val}; ret = af9015_ctrl_msg(d, &req); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_regmap_read(void *context, const void *reg_buf, size_t reg_size, void *val_buf, size_t val_size) { struct dvb_usb_device *d = context; struct usb_interface *intf = d->intf; int ret; u16 reg = ((u8 *)reg_buf)[0] << 8 | ((u8 *)reg_buf)[1] << 0; u8 *val = &((u8 *)val_buf)[0]; const unsigned int len = val_size; struct req_t req = {READ_MEMORY, 0, reg, 0, 0, len, val}; ret = af9015_ctrl_msg(d, &req); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static int af9015_probe(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; struct usb_device *udev = interface_to_usbdev(intf); int ret; char manufacturer[sizeof("ITE Technologies, Inc.")]; static const struct regmap_config regmap_config = { .reg_bits = 16, .val_bits = 8, }; static const struct regmap_bus regmap_bus = { .read = af9015_regmap_read, .write = af9015_regmap_write, }; dev_dbg(&intf->dev, "\n"); memset(manufacturer, 0, sizeof(manufacturer)); usb_string(udev, udev->descriptor.iManufacturer, manufacturer, sizeof(manufacturer)); /* * There is two devices having same ID but different chipset. One uses * AF9015 and the other IT9135 chipset. Only difference seen on lsusb * is iManufacturer string. * * idVendor 0x0ccd TerraTec Electronic GmbH * idProduct 0x0099 * bcdDevice 2.00 * iManufacturer 1 Afatech * iProduct 2 DVB-T 2 * * idVendor 0x0ccd TerraTec Electronic GmbH * idProduct 0x0099 * bcdDevice 2.00 * iManufacturer 1 ITE Technologies, Inc. * iProduct 2 DVB-T TV Stick */ if ((le16_to_cpu(udev->descriptor.idVendor) == USB_VID_TERRATEC) && (le16_to_cpu(udev->descriptor.idProduct) == 0x0099)) { if (!strcmp("ITE Technologies, Inc.", manufacturer)) { ret = -ENODEV; dev_dbg(&intf->dev, "rejecting device\n"); goto err; } } state->regmap = regmap_init(&intf->dev, ®map_bus, d, ®map_config); if (IS_ERR(state->regmap)) { ret = PTR_ERR(state->regmap); goto err; } return 0; err: dev_dbg(&intf->dev, "failed %d\n", ret); return ret; } static void af9015_disconnect(struct dvb_usb_device *d) { struct af9015_state *state = d_to_priv(d); struct usb_interface *intf = d->intf; dev_dbg(&intf->dev, "\n"); regmap_exit(state->regmap); } /* * Interface 0 is used by DVB-T receiver and * interface 1 is for remote controller (HID) */ static const struct dvb_usb_device_properties af9015_props = { .driver_name = KBUILD_MODNAME, .owner = THIS_MODULE, .adapter_nr = adapter_nr, .size_of_priv = sizeof(struct af9015_state), .generic_bulk_ctrl_endpoint = 0x02, .generic_bulk_ctrl_endpoint_response = 0x81, .probe = af9015_probe, .disconnect = af9015_disconnect, .identify_state = af9015_identify_state, .firmware = AF9015_FIRMWARE, .download_firmware = af9015_download_firmware, .i2c_algo = &af9015_i2c_algo, .read_config = af9015_read_config, .frontend_attach = af9015_af9013_frontend_attach, .frontend_detach = af9015_frontend_detach, .tuner_attach = af9015_tuner_attach, .init = af9015_init, .get_rc_config = af9015_get_rc_config, .get_stream_config = af9015_get_stream_config, .streaming_ctrl = af9015_streaming_ctrl, .get_adapter_count = af9015_get_adapter_count, .adapter = { { .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 32, .pid_filter = af9015_pid_filter, .pid_filter_ctrl = af9015_pid_filter_ctrl, .stream = DVB_USB_STREAM_BULK(0x84, 6, 87 * 188), }, { .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 32, .pid_filter = af9015_pid_filter, .pid_filter_ctrl = af9015_pid_filter_ctrl, .stream = DVB_USB_STREAM_BULK(0x85, 6, 87 * 188), }, }, }; static const struct usb_device_id af9015_id_table[] = { { DVB_USB_DEVICE(USB_VID_AFATECH, USB_PID_AFATECH_AF9015_9015, &af9015_props, "Afatech AF9015 reference design", NULL) }, { DVB_USB_DEVICE(USB_VID_AFATECH, USB_PID_AFATECH_AF9015_9016, &af9015_props, "Afatech AF9015 reference design", NULL) }, { DVB_USB_DEVICE(USB_VID_LEADTEK, USB_PID_WINFAST_DTV_DONGLE_GOLD, &af9015_props, "Leadtek WinFast DTV Dongle Gold", RC_MAP_LEADTEK_Y04G0051) }, { DVB_USB_DEVICE(USB_VID_PINNACLE, USB_PID_PINNACLE_PCTV71E, &af9015_props, "Pinnacle PCTV 71e", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_399U, &af9015_props, "KWorld PlusTV Dual DVB-T Stick (DVB-T 399U)", NULL) }, { DVB_USB_DEVICE(USB_VID_VISIONPLUS, USB_PID_TINYTWIN, &af9015_props, "DigitalNow TinyTwin", RC_MAP_AZUREWAVE_AD_TU700) }, { DVB_USB_DEVICE(USB_VID_VISIONPLUS, USB_PID_AZUREWAVE_AD_TU700, &af9015_props, "TwinHan AzureWave AD-TU700(704J)", RC_MAP_AZUREWAVE_AD_TU700) }, { DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_USB_XE_REV2, &af9015_props, "TerraTec Cinergy T USB XE", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_PC160_2T, &af9015_props, "KWorld PlusTV Dual DVB-T PCI (DVB-T PC160-2T)", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_VOLAR_X, &af9015_props, "AVerMedia AVerTV DVB-T Volar X", RC_MAP_AVERMEDIA_M135A) }, { DVB_USB_DEVICE(USB_VID_XTENSIONS, USB_PID_XTENSIONS_XD_380, &af9015_props, "Xtensions XD-380", NULL) }, { DVB_USB_DEVICE(USB_VID_MSI_2, USB_PID_MSI_DIGIVOX_DUO, &af9015_props, "MSI DIGIVOX Duo", RC_MAP_MSI_DIGIVOX_III) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_VOLAR_X_2, &af9015_props, "Fujitsu-Siemens Slim Mobile USB DVB-T", NULL) }, { DVB_USB_DEVICE(USB_VID_TELESTAR, USB_PID_TELESTAR_STARSTICK_2, &af9015_props, "Telestar Starstick 2", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A309, &af9015_props, "AVerMedia A309", NULL) }, { DVB_USB_DEVICE(USB_VID_MSI_2, USB_PID_MSI_DIGI_VOX_MINI_III, &af9015_props, "MSI Digi VOX mini III", RC_MAP_MSI_DIGIVOX_III) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U_2, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U_3, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_AFATECH, USB_PID_TREKSTOR_DVBT, &af9015_props, "TrekStor DVB-T USB Stick", RC_MAP_TREKSTOR) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A850, &af9015_props, "AverMedia AVerTV Volar Black HD (A850)", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A805, &af9015_props, "AverMedia AVerTV Volar GPS 805 (A805)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_CONCEPTRONIC_CTVDIGRCU, &af9015_props, "Conceptronic USB2.0 DVB-T CTVDIGRCU V3.0", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_MC810, &af9015_props, "KWorld Digital MC-810", NULL) }, { DVB_USB_DEVICE(USB_VID_KYE, USB_PID_GENIUS_TVGO_DVB_T03, &af9015_props, "Genius TVGo DVB-T03", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_399U_2, &af9015_props, "KWorld PlusTV Dual DVB-T Stick (DVB-T 399U)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_PC160_T, &af9015_props, "KWorld PlusTV DVB-T PCI Pro Card (DVB-T PC160-T)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_SVEON_STV20, &af9015_props, "Sveon STV20 Tuner USB DVB-T HDTV", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_TINYTWIN_2, &af9015_props, "DigitalNow TinyTwin v2", RC_MAP_DIGITALNOW_TINYTWIN) }, { DVB_USB_DEVICE(USB_VID_LEADTEK, USB_PID_WINFAST_DTV2000DS, &af9015_props, "Leadtek WinFast DTV2000DS", RC_MAP_LEADTEK_Y04G0051) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_UB383_T, &af9015_props, "KWorld USB DVB-T Stick Mobile (UB383-T)", NULL) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_KWORLD_395U_4, &af9015_props, "KWorld USB DVB-T TV Stick II (VS-DVB-T 395U)", NULL) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A815M, &af9015_props, "AverMedia AVerTV Volar M (A815Mac)", NULL) }, { DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_STICK_RC, &af9015_props, "TerraTec Cinergy T Stick RC", RC_MAP_TERRATEC_SLIM_2) }, /* XXX: that same ID [0ccd:0099] is used by af9035 driver too */ { DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_STICK_DUAL_RC, &af9015_props, "TerraTec Cinergy T Stick Dual RC", RC_MAP_TERRATEC_SLIM) }, { DVB_USB_DEVICE(USB_VID_AVERMEDIA, USB_PID_AVERMEDIA_A850T, &af9015_props, "AverMedia AVerTV Red HD+ (A850T)", NULL) }, { DVB_USB_DEVICE(USB_VID_GTEK, USB_PID_TINYTWIN_3, &af9015_props, "DigitalNow TinyTwin v3", RC_MAP_DIGITALNOW_TINYTWIN) }, { DVB_USB_DEVICE(USB_VID_KWORLD_2, USB_PID_SVEON_STV22, &af9015_props, "Sveon STV22 Dual USB DVB-T Tuner HDTV", RC_MAP_MSI_DIGIVOX_III) }, { } }; MODULE_DEVICE_TABLE(usb, af9015_id_table); /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver af9015_usb_driver = { .name = KBUILD_MODNAME, .id_table = af9015_id_table, .probe = dvb_usbv2_probe, .disconnect = dvb_usbv2_disconnect, .suspend = dvb_usbv2_suspend, .resume = dvb_usbv2_resume, .reset_resume = dvb_usbv2_reset_resume, .no_dynamic_id = 1, .soft_unbind = 1, }; module_usb_driver(af9015_usb_driver); MODULE_AUTHOR("Antti Palosaari <crope@iki.fi>"); MODULE_DESCRIPTION("Afatech AF9015 driver"); MODULE_LICENSE("GPL"); MODULE_FIRMWARE(AF9015_FIRMWARE); |
5 5 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | /* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_DRM_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _DRM_TRACE_H_ #include <linux/stringify.h> #include <linux/types.h> #include <linux/tracepoint.h> struct drm_file; #undef TRACE_SYSTEM #define TRACE_SYSTEM drm #define TRACE_INCLUDE_FILE drm_trace TRACE_EVENT(drm_vblank_event, TP_PROTO(int crtc, unsigned int seq, ktime_t time, bool high_prec), TP_ARGS(crtc, seq, time, high_prec), TP_STRUCT__entry( __field(int, crtc) __field(unsigned int, seq) __field(ktime_t, time) __field(bool, high_prec) ), TP_fast_assign( __entry->crtc = crtc; __entry->seq = seq; __entry->time = time; __entry->high_prec = high_prec; ), TP_printk("crtc=%d, seq=%u, time=%lld, high-prec=%s", __entry->crtc, __entry->seq, __entry->time, __entry->high_prec ? "true" : "false") ); TRACE_EVENT(drm_vblank_event_queued, TP_PROTO(struct drm_file *file, int crtc, unsigned int seq), TP_ARGS(file, crtc, seq), TP_STRUCT__entry( __field(struct drm_file *, file) __field(int, crtc) __field(unsigned int, seq) ), TP_fast_assign( __entry->file = file; __entry->crtc = crtc; __entry->seq = seq; ), TP_printk("file=%p, crtc=%d, seq=%u", __entry->file, __entry->crtc, \ __entry->seq) ); TRACE_EVENT(drm_vblank_event_delivered, TP_PROTO(struct drm_file *file, int crtc, unsigned int seq), TP_ARGS(file, crtc, seq), TP_STRUCT__entry( __field(struct drm_file *, file) __field(int, crtc) __field(unsigned int, seq) ), TP_fast_assign( __entry->file = file; __entry->crtc = crtc; __entry->seq = seq; ), TP_printk("file=%p, crtc=%d, seq=%u", __entry->file, __entry->crtc, \ __entry->seq) ); #endif /* _DRM_TRACE_H_ */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH ../../drivers/gpu/drm #include <trace/define_trace.h> |
1 1 1 2 2 1 1 1 1 1 24 24 25 25 1 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 | // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Basic Transport Functions exploiting Infiniband API * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/socket.h> #include <linux/if_vlan.h> #include <linux/random.h> #include <linux/workqueue.h> #include <linux/wait.h> #include <linux/reboot.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/smc.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> #include "smc.h" #include "smc_clc.h" #include "smc_core.h" #include "smc_ib.h" #include "smc_wr.h" #include "smc_llc.h" #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" #include "smc_netlink.h" #include "smc_stats.h" #include "smc_tracepoint.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), .list = LIST_HEAD_INIT(smc_lgr_list.list), .num = 0, }; static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); static void smc_link_down_work(struct work_struct *work); /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, spinlock_t **lgr_lock) { if (lgr->is_smcd) { *lgr_lock = &lgr->smcd->lgr_lock; return &lgr->smcd->lgr_list; } *lgr_lock = &smc_lgr_list.lock; return &smc_lgr_list.list; } static void smc_ibdev_cnt_inc(struct smc_link *lnk) { atomic_inc(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); } static void smc_ibdev_cnt_dec(struct smc_link *lnk) { atomic_dec(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); } static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ if (!lgr->freeing) { mod_delayed_work(system_wq, &lgr->free_work, (!lgr->is_smcd && lgr->role == SMC_CLNT) ? SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); } } /* Register connection's alert token in our lookup structure. * To use rbtrees we have to implement our own insert core. * Requires @conns_lock * @smc connection to register * Returns 0 on success, != otherwise. */ static void smc_lgr_add_alert_token(struct smc_connection *conn) { struct rb_node **link, *parent = NULL; u32 token = conn->alert_token_local; link = &conn->lgr->conns_all.rb_node; while (*link) { struct smc_connection *cur = rb_entry(*link, struct smc_connection, alert_node); parent = *link; if (cur->alert_token_local > token) link = &parent->rb_left; else link = &parent->rb_right; } /* Put the new node there */ rb_link_node(&conn->alert_node, parent, link); rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); } /* assign an SMC-R link to the connection */ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) { enum smc_link_state expected = first ? SMC_LNK_ACTIVATING : SMC_LNK_ACTIVE; int i, j; /* do link balancing */ conn->lnk = NULL; /* reset conn->lnk first */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &conn->lgr->lnk[i]; if (lnk->state != expected || lnk->link_is_asym) continue; if (conn->lgr->role == SMC_CLNT) { conn->lnk = lnk; /* temporary, SMC server assigns link*/ break; } if (conn->lgr->conns_num % 2) { for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { struct smc_link *lnk2; lnk2 = &conn->lgr->lnk[j]; if (lnk2->state == expected && !lnk2->link_is_asym) { conn->lnk = lnk2; break; } } } if (!conn->lnk) conn->lnk = lnk; break; } if (!conn->lnk) return SMC_CLC_DECL_NOACTLINK; atomic_inc(&conn->lnk->conn_cnt); return 0; } /* Register connection in link group by assigning an alert token * registered in a search tree. * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ static int smc_lgr_register_conn(struct smc_connection *conn, bool first) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); int rc; if (!conn->lgr->is_smcd) { rc = smcr_lgr_conn_assign_link(conn, first); if (rc) { conn->lgr = NULL; return rc; } } /* find a new alert_token_local value not yet used by some connection * in this link group */ sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */ while (!conn->alert_token_local) { conn->alert_token_local = atomic_inc_return(&nexttoken); if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr)) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); conn->lgr->conns_num++; return 0; } /* Unregister connection and reset the alert token of the given connection< */ static void __smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); struct smc_link_group *lgr = conn->lgr; rb_erase(&conn->alert_node, &lgr->conns_all); if (conn->lnk) atomic_dec(&conn->lnk->conn_cnt); lgr->conns_num--; conn->alert_token_local = 0; sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ } /* Unregister connection from lgr */ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!smc_conn_lgr_valid(conn)) return; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); } int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); char hostname[SMC_MAX_HOSTNAME_LEN + 1]; char smc_seid[SMC_MAX_EID_LEN + 1]; struct nlattr *attrs; u8 *seid = NULL; u8 *host = NULL; void *nlh; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_SYS_INFO); if (!nlh) goto errmsg; if (cb_ctx->pos[0]) goto errout; attrs = nla_nest_start(skb, SMC_GEN_SYS_INFO); if (!attrs) goto errout; if (nla_put_u8(skb, SMC_NLA_SYS_VER, SMC_V2)) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_REL, SMC_RELEASE)) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable())) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_IS_SMCR_V2, true)) goto errattr; smc_clc_get_hostname(&host); if (host) { memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN); hostname[SMC_MAX_HOSTNAME_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname)) goto errattr; } if (smc_ism_is_v2_capable()) { smc_ism_get_system_eid(&seid); memcpy(smc_seid, seid, SMC_MAX_EID_LEN); smc_seid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid)) goto errattr; } nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); cb_ctx->pos[0] = 1; return skb->len; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return skb->len; } /* Fill SMC_NLA_LGR_D_V2_COMMON/SMC_NLA_LGR_R_V2_COMMON nested attributes */ static int smc_nl_fill_lgr_v2_common(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb, struct nlattr *v2_attrs) { char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; char smc_eid[SMC_MAX_EID_LEN + 1]; if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) goto errv2attr; memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); smc_host[SMC_MAX_HOSTNAME_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) goto errv2attr; memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); smc_eid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) goto errv2attr; nla_nest_end(skb, v2_attrs); return 0; errv2attr: nla_nest_cancel(skb, v2_attrs); return -EMSGSIZE; } static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *v2_attrs; v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2); if (!v2_attrs) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_CONNS, lgr->max_conns)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_LINKS, lgr->max_links)) goto errv2attr; nla_nest_end(skb, v2_attrs); return 0; errv2attr: nla_nest_cancel(skb, v2_attrs); errattr: return -EMSGSIZE; } static int smc_nl_fill_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { char smc_target[SMC_MAX_PNETID_LEN + 1]; struct nlattr *attrs, *v2_attrs; attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR); if (!attrs) goto errout; if (nla_put_u32(skb, SMC_NLA_LGR_R_ID, *((u32 *)&lgr->id))) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_R_CONNS_NUM, lgr->conns_num)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_ROLE, lgr->role)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, lgr->buf_type)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE, lgr->net->net_cookie, SMC_NLA_LGR_R_PAD)) goto errattr; memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); smc_target[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) goto errattr; if (lgr->smc_version > SMC_V1) { v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON); if (!v2_attrs) goto errattr; if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) goto errattr; if (smc_nl_fill_smcr_lgr_v2(lgr, skb, cb)) goto errattr; } nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, struct smc_link *link, struct sk_buff *skb, struct netlink_callback *cb) { char smc_ibname[IB_DEVICE_NAME_MAX]; u8 smc_gid_target[41]; struct nlattr *attrs; u32 link_uid = 0; void *nlh; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_LINK_SMCR); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_LINK_SMCR); if (!attrs) goto errout; if (nla_put_u8(skb, SMC_NLA_LINK_ID, link->link_id)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LINK_STATE, link->state)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LINK_CONN_CNT, atomic_read(&link->conn_cnt))) goto errattr; if (nla_put_u8(skb, SMC_NLA_LINK_IB_PORT, link->ibport)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LINK_NET_DEV, link->ndev_ifidx)) goto errattr; snprintf(smc_ibname, sizeof(smc_ibname), "%s", link->ibname); if (nla_put_string(skb, SMC_NLA_LINK_IB_DEV, smc_ibname)) goto errattr; memcpy(&link_uid, link->link_uid, sizeof(link_uid)); if (nla_put_u32(skb, SMC_NLA_LINK_UID, link_uid)) goto errattr; memcpy(&link_uid, link->peer_link_uid, sizeof(link_uid)); if (nla_put_u32(skb, SMC_NLA_LINK_PEER_UID, link_uid)) goto errattr; memset(smc_gid_target, 0, sizeof(smc_gid_target)); smc_gid_be16_convert(smc_gid_target, link->gid); if (nla_put_string(skb, SMC_NLA_LINK_GID, smc_gid_target)) goto errattr; memset(smc_gid_target, 0, sizeof(smc_gid_target)); smc_gid_be16_convert(smc_gid_target, link->peer_gid); if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target)) goto errattr; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static int smc_nl_handle_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb, bool list_links) { void *nlh; int i; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_LGR_SMCR); if (!nlh) goto errmsg; if (smc_nl_fill_lgr(lgr, skb, cb)) goto errout; genlmsg_end(skb, nlh); if (!list_links) goto out; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_usable(&lgr->lnk[i])) continue; if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb)) goto errout; } out: return 0; errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr, struct sk_buff *skb, struct netlink_callback *cb, bool list_links) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct smc_link_group *lgr; int snum = cb_ctx->pos[0]; int num = 0; spin_lock_bh(&smc_lgr->lock); list_for_each_entry(lgr, &smc_lgr->list, list) { if (num < snum) goto next; if (smc_nl_handle_lgr(lgr, skb, cb, list_links)) goto errout; next: num++; } errout: spin_unlock_bh(&smc_lgr->lock); cb_ctx->pos[0] = num; } static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smcd_dev *smcd = lgr->smcd; struct smcd_gid smcd_gid; struct nlattr *attrs; void *nlh; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_LGR_SMCD); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCD); if (!attrs) goto errout; if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; smcd->ops->get_local_gid(smcd, &smcd_gid); if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_EXT_GID, smcd_gid.gid_ext, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_EXT_GID, lgr->peer_gid.gid_ext, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) goto errattr; memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN); smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) goto errattr; if (lgr->smc_version > SMC_V1) { struct nlattr *v2_attrs; v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_D_V2_COMMON); if (!v2_attrs) goto errattr; if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) goto errattr; } nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static int smc_nl_handle_smcd_lgr(struct smcd_dev *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct smc_link_group *lgr; int snum = cb_ctx->pos[1]; int rc = 0, num = 0; spin_lock_bh(&dev->lgr_lock); list_for_each_entry(lgr, &dev->lgr_list, list) { if (!lgr->is_smcd) continue; if (num < snum) goto next; rc = smc_nl_fill_smcd_lgr(lgr, skb, cb); if (rc) goto errout; next: num++; } errout: spin_unlock_bh(&dev->lgr_lock); cb_ctx->pos[1] = num; return rc; } static int smc_nl_fill_smcd_dev(struct smcd_dev_list *dev_list, struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct smcd_dev *smcd_dev; int snum = cb_ctx->pos[0]; int rc = 0, num = 0; mutex_lock(&dev_list->mutex); list_for_each_entry(smcd_dev, &dev_list->list, list) { if (list_empty(&smcd_dev->lgr_list)) continue; if (num < snum) goto next; rc = smc_nl_handle_smcd_lgr(smcd_dev, skb, cb); if (rc) goto errout; next: num++; } errout: mutex_unlock(&dev_list->mutex); cb_ctx->pos[0] = num; return rc; } int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) { bool list_links = false; smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); return skb->len; } int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb) { bool list_links = true; smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); return skb->len; } int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) { smc_nl_fill_smcd_dev(&smcd_dev_list, skb, cb); return skb->len; } void smc_lgr_cleanup_early(struct smc_link_group *lgr) { spinlock_t *lgr_lock; if (!lgr) return; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ if (!list_empty(&lgr->list)) list_del_init(&lgr->list); spin_unlock_bh(lgr_lock); __smc_lgr_terminate(lgr, true); } static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) { int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_sendable(lnk)) lnk->state = SMC_LNK_INACTIVE; } wake_up_all(&lgr->llc_msg_waiter); wake_up_all(&lgr->llc_flow_waiter); } static void smc_lgr_free(struct smc_link_group *lgr); static void smc_lgr_free_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(to_delayed_work(work), struct smc_link_group, free_work); spinlock_t *lgr_lock; bool conns; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); if (lgr->freeing) { spin_unlock_bh(lgr_lock); return; } read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); if (!conns) { /* number of lgr connections is no longer zero */ spin_unlock_bh(lgr_lock); return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); if (!lgr->is_smcd && !lgr->terminating) smc_llc_send_link_delete_all(lgr, true, SMC_LLC_DEL_PROG_INIT_TERM); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); if (!lgr->is_smcd) smcr_lgr_link_deactivate_all(lgr); smc_lgr_free(lgr); } static void smc_lgr_terminate_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(work, struct smc_link_group, terminate_work); __smc_lgr_terminate(lgr, true); } /* return next unique link id for the lgr */ static u8 smcr_next_link_id(struct smc_link_group *lgr) { u8 link_id; int i; while (1) { again: link_id = ++lgr->next_link_id; if (!link_id) /* skip zero as link_id */ link_id = ++lgr->next_link_id; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (smc_link_usable(&lgr->lnk[i]) && lgr->lnk[i].link_id == link_id) goto again; } break; } return link_id; } static void smcr_copy_dev_info_to_link(struct smc_link *link) { struct smc_ib_device *smcibdev = link->smcibdev; snprintf(link->ibname, sizeof(link->ibname), "%s", smcibdev->ibdev->name); link->ndev_ifidx = smcibdev->ndev_ifidx[link->ibport - 1]; } int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini) { struct smc_ib_device *smcibdev; u8 rndvec[3]; int rc; if (lgr->smc_version == SMC_V2) { lnk->smcibdev = ini->smcrv2.ib_dev_v2; lnk->ibport = ini->smcrv2.ib_port_v2; } else { lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; lnk->wr_rx_id_compl = 0; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!lnk->smcibdev->initialized) { rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev); if (rc) goto out; } get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, ini->vlan_id, lnk->gid, &lnk->sgid_index, lgr->smc_version == SMC_V2 ? &ini->smcrv2 : NULL); if (rc) goto out; rc = smc_llc_link_init(lnk); if (rc) goto out; rc = smc_wr_alloc_link_mem(lnk); if (rc) goto clear_llc_lnk; rc = smc_ib_create_protection_domain(lnk); if (rc) goto free_link_mem; rc = smc_ib_create_queue_pair(lnk); if (rc) goto dealloc_pd; rc = smc_wr_create_link(lnk); if (rc) goto destroy_qp; lnk->state = SMC_LNK_ACTIVATING; return 0; destroy_qp: smc_ib_destroy_queue_pair(lnk); dealloc_pd: smc_ib_dealloc_protection_domain(lnk); free_link_mem: smc_wr_free_link_mem(lnk); clear_llc_lnk: smc_llc_link_clear(lnk, false); out: smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); smc_lgr_put(lgr); /* lgr_hold above */ return rc; } /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; struct list_head *lgr_list; struct smcd_dev *smcd; struct smc_link *lnk; spinlock_t *lgr_lock; u8 link_idx; int rc = 0; int i; if (ini->is_smcd && ini->vlan_id) { if (smc_ism_get_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id)) { rc = SMC_CLC_DECL_ISMVLANERR; goto out; } } lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = SMC_CLC_DECL_MEM; goto ism_put_vlan; } lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0, SMC_LGR_ID_SIZE, &lgr->id); if (!lgr->tx_wq) { rc = -ENOMEM; goto free_lgr; } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; lgr->terminating = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ init_rwsem(&lgr->sndbufs_lock); init_rwsem(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } lgr->next_link_id = 0; smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work); lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ smcd = ini->ism_dev[ini->ism_selected]; get_device(smcd->ops->get_dev(smcd)); lgr->peer_gid.gid = ini->ism_peer_gid[ini->ism_selected].gid; lgr->peer_gid.gid_ext = ini->ism_peer_gid[ini->ism_selected].gid_ext; lgr->smcd = ini->ism_dev[ini->ism_selected]; lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list; lgr_lock = &lgr->smcd->lgr_lock; lgr->smc_version = ini->smcd_version; lgr->peer_shutdown = 0; atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt); } else { /* SMC-R specific settings */ struct smc_ib_device *ibdev; int ibport; lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; lgr->smc_version = ini->smcr_version; memcpy(lgr->peer_systemid, ini->peer_systemid, SMC_SYSTEMID_LEN); if (lgr->smc_version == SMC_V2) { ibdev = ini->smcrv2.ib_dev_v2; ibport = ini->smcrv2.ib_port_v2; lgr->saddr = ini->smcrv2.saddr; lgr->uses_gateway = ini->smcrv2.uses_gateway; memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, ETH_ALEN); lgr->max_conns = ini->max_conns; lgr->max_links = ini->max_links; } else { ibdev = ini->ib_dev; ibport = ini->ib_port; lgr->max_conns = SMC_CONN_PER_LGR_MAX; lgr->max_links = SMC_LINKS_ADD_LNK_MAX; } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); rc = smc_wr_alloc_lgr_mem(lgr); if (rc) goto free_wq; smc_llc_lgr_init(lgr, smc); link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { smc_wr_free_lgr_mem(lgr); goto free_wq; } lgr->net = smc_ib_net(lnk->smcibdev); lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; lgr->buf_type = lgr->net->smc.sysctl_smcr_buf_type; atomic_inc(&lgr_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); list_add_tail(&lgr->list, lgr_list); spin_unlock_bh(lgr_lock); return 0; free_wq: destroy_workqueue(lgr->tx_wq); free_lgr: kfree(lgr); ism_put_vlan: if (ini->is_smcd && ini->vlan_id) smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id); out: if (rc < 0) { if (rc == -ENOMEM) rc = SMC_CLC_DECL_MEM; else rc = SMC_CLC_DECL_INTERR; } return rc; } static int smc_write_space(struct smc_connection *conn) { int buffer_len = conn->peer_rmbe_size; union smc_host_cursor prod; union smc_host_cursor cons; int space; smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); /* determine rx_buf space */ space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod); return space; } static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, struct smc_wr_buf *wr_buf) { struct smc_connection *conn = &smc->conn; union smc_host_cursor cons, fin; int rc = 0; int diff; smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn); smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn); /* set prod cursor to old state, enforce tx_rdma_writes() */ smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) { /* cons cursor advanced more than fin, and prod was set * fin above, so now prod is smaller than cons. Fix that. */ diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons); smc_curs_add(conn->sndbuf_desc->len, &conn->tx_curs_sent, diff); smc_curs_add(conn->sndbuf_desc->len, &conn->tx_curs_fin, diff); smp_mb__before_atomic(); atomic_add(diff, &conn->sndbuf_space); smp_mb__after_atomic(); smc_curs_add(conn->peer_rmbe_size, &conn->local_tx_ctrl.prod, diff); smc_curs_add(conn->peer_rmbe_size, &conn->local_tx_ctrl_fin, diff); } /* recalculate, value is used by tx_rdma_writes() */ atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn)); if (smc->sk.sk_state != SMC_INIT && smc->sk.sk_state != SMC_CLOSED) { rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); if (!rc) { queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0); smc->sk.sk_data_ready(&smc->sk); } } else { smc_wr_tx_put_slot(conn->lnk, (struct smc_wr_tx_pend_priv *)pend); } return rc; } void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); /* link_hold in smc_conn_create() */ smcr_link_put(conn->lnk); conn->lnk = to_lnk; atomic_inc(&conn->lnk->conn_cnt); /* link_put in smc_conn_free() */ smcr_link_hold(conn->lnk); } struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err) { struct smc_link *to_lnk = NULL; struct smc_cdc_tx_pend *pend; struct smc_connection *conn; struct smc_wr_buf *wr_buf; struct smc_sock *smc; struct rb_node *node; int i, rc = 0; /* link is inactive, wake up tx waiters */ smc_wr_wakeup_tx_wait(from_lnk); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i]) || i == from_lnk->link_idx) continue; if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev && from_lnk->ibport == lgr->lnk[i].ibport) { continue; } to_lnk = &lgr->lnk[i]; break; } if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) { smc_lgr_terminate_sched(lgr); return NULL; } again: read_lock_bh(&lgr->conns_lock); for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) { conn = rb_entry(node, struct smc_connection, alert_node); if (conn->lnk != from_lnk) continue; smc = container_of(conn, struct smc_sock, conn); /* conn->lnk not yet set in SMC_INIT state */ if (smc->sk.sk_state == SMC_INIT) continue; if (smc->sk.sk_state == SMC_CLOSED || smc->sk.sk_state == SMC_PEERCLOSEWAIT1 || smc->sk.sk_state == SMC_PEERCLOSEWAIT2 || smc->sk.sk_state == SMC_APPFINCLOSEWAIT || smc->sk.sk_state == SMC_APPCLOSEWAIT1 || smc->sk.sk_state == SMC_APPCLOSEWAIT2 || smc->sk.sk_state == SMC_PEERFINCLOSEWAIT || smc->sk.sk_state == SMC_PEERABORTWAIT || smc->sk.sk_state == SMC_PROCESSABORT) { spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); spin_unlock_bh(&conn->send_lock); continue; } sock_hold(&smc->sk); read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); if (rc) goto err_out; /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); if (rc) goto err_out; goto again; } read_unlock_bh(&lgr->conns_lock); smc_wr_tx_link_put(to_lnk); return to_lnk; err_out: smcr_link_down_cond_sched(to_lnk); smc_wr_tx_link_put(to_lnk); return NULL; } static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { struct rw_semaphore *lock; /* lock buffer list */ int rc; if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ down_read(&lgr->llc_conf_mutex); smc_llc_do_delete_rkey(lgr, buf_desc); buf_desc->is_conf_rkey = false; up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } } if (buf_desc->is_reg_err) { /* buf registration failed, reuse not possible */ lock = is_rmb ? &lgr->rmbs_lock : &lgr->sndbufs_lock; down_write(lock); list_del(&buf_desc->list); up_write(lock); smc_buf_free(lgr, is_rmb, buf_desc); } else { /* memzero_explicit provides potential memory barrier semantics */ memzero_explicit(buf_desc->cpu_addr, buf_desc->len); WRITE_ONCE(buf_desc->used, 0); } } static void smcd_buf_detach(struct smc_connection *conn) { struct smcd_dev *smcd = conn->lgr->smcd; u64 peer_token = conn->peer_token; if (!conn->sndbuf_desc) return; smc_ism_detach_dmb(smcd, peer_token); kfree(conn->sndbuf_desc); conn->sndbuf_desc = NULL; } static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) { if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { smcr_buf_unuse(conn->sndbuf_desc, false, lgr); } else { memzero_explicit(conn->sndbuf_desc->cpu_addr, conn->sndbuf_desc->len); WRITE_ONCE(conn->sndbuf_desc->used, 0); } } if (conn->rmb_desc) { if (!lgr->is_smcd) { smcr_buf_unuse(conn->rmb_desc, true, lgr); } else { memzero_explicit(conn->rmb_desc->cpu_addr, conn->rmb_desc->len + sizeof(struct smcd_cdc_msg)); WRITE_ONCE(conn->rmb_desc->used, 0); } } } /* remove a finished connection from its link group */ void smc_conn_free(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!lgr || conn->freed) /* Connection has never been registered in a * link group, or has already been freed. */ return; conn->freed = 1; if (!smc_conn_lgr_valid(conn)) /* Connection has already unregistered from * link group. */ goto lgr_put; if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); if (smc_ism_support_dmb_nocopy(lgr->smcd)) smcd_buf_detach(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_wait_pend_tx_wr(conn); if (current_work() != &conn->abort_work) cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { smc_buf_unuse(conn, lgr); /* allow buffer reuse */ smc_lgr_unregister_conn(conn); } if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); lgr_put: if (!lgr->is_smcd) smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */ smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } /* unregister a link from a buf_desc */ static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { if (is_rmb || buf_desc->is_vm) buf_desc->is_reg_mr[lnk->link_idx] = false; if (!buf_desc->is_map_ib[lnk->link_idx]) return; if ((is_rmb || buf_desc->is_vm) && buf_desc->mr[lnk->link_idx]) { smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]); buf_desc->mr[lnk->link_idx] = NULL; } if (is_rmb) smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); else smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); sg_free_table(&buf_desc->sgt[lnk->link_idx]); buf_desc->is_map_ib[lnk->link_idx] = false; } /* unmap all buffers of lgr for a deleted link */ static void smcr_buf_unmap_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_buf_desc *buf_desc, *bf; int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { down_write(&lgr->rmbs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) smcr_buf_unmap_link(buf_desc, true, lnk); up_write(&lgr->rmbs_lock); down_write(&lgr->sndbufs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) smcr_buf_unmap_link(buf_desc, false, lnk); up_write(&lgr->sndbufs_lock); } } static void smcr_rtoken_clear_link(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { lgr->rtokens[i][lnk->link_idx].rkey = 0; lgr->rtokens[i][lnk->link_idx].dma_addr = 0; } } static void __smcr_link_clear(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; smc_wr_free_link_mem(lnk); smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ } /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk, bool log) { if (!lnk->lgr || lnk->clearing || lnk->state == SMC_LNK_UNUSED) return; lnk->clearing = 1; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); smcr_buf_unmap_lgr(lnk); smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_error(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smcr_link_put(lnk); /* theoretically last link_put */ } void smcr_link_hold(struct smc_link *lnk) { refcount_inc(&lnk->refcnt); } void smcr_link_put(struct smc_link *lnk) { if (refcount_dec_and_test(&lnk->refcnt)) __smcr_link_clear(lnk); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); else if (buf_desc->is_vm && buf_desc->cpu_addr) vfree(buf_desc->cpu_addr); kfree(buf_desc); } static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, struct smc_buf_desc *buf_desc) { if (is_dmb) { /* restore original buf len */ buf_desc->len += sizeof(struct smcd_cdc_msg); smc_ism_unregister_dmb(lgr->smcd, buf_desc); } else { kfree(buf_desc->cpu_addr); } kfree(buf_desc); } static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { if (lgr->is_smcd) smcd_buf_free(lgr, is_rmb, buf_desc); else smcr_buf_free(lgr, is_rmb, buf_desc); } static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf_desc; struct list_head *buf_list; int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { if (is_rmb) buf_list = &lgr->rmbs[i]; else buf_list = &lgr->sndbufs[i]; list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { list_del(&buf_desc->list); smc_buf_free(lgr, is_rmb, buf_desc); } } } static void smc_lgr_free_bufs(struct smc_link_group *lgr) { /* free send buffers */ __smc_lgr_free_bufs(lgr, false); /* free rmbs */ __smc_lgr_free_bufs(lgr, true); } /* won't be freed until no one accesses to lgr anymore */ static void __smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { smc_wr_free_lgr_mem(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } kfree(lgr); } /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { int i; if (!lgr->is_smcd) { down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i], false); } up_write(&lgr->llc_conf_mutex); smc_llc_lgr_clear(lgr); } destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(lgr->smcd->ops->get_dev(lgr->smcd)); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } void smc_lgr_hold(struct smc_link_group *lgr) { refcount_inc(&lgr->refcnt); } void smc_lgr_put(struct smc_link_group *lgr) { if (refcount_dec_and_test(&lgr->refcnt)) __smc_lgr_free(lgr); } static void smc_sk_wake_ups(struct smc_sock *smc) { smc->sk.sk_write_space(&smc->sk); smc->sk.sk_data_ready(&smc->sk); smc->sk.sk_state_change(&smc->sk); } /* kill a connection */ static void smc_conn_kill(struct smc_connection *conn, bool soft) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); if (conn->lgr->is_smcd && conn->lgr->peer_shutdown) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; else smc_close_abort(conn); conn->killed = 1; smc->sk.sk_err = ECONNABORTED; smc_sk_wake_ups(smc); if (conn->lgr->is_smcd) { smc_ism_unset_conn(conn); if (smc_ism_support_dmb_nocopy(conn->lgr->smcd)) smcd_buf_detach(conn); if (soft) tasklet_kill(&conn->rx_tsklet); else tasklet_unlock_wait(&conn->rx_tsklet); } else { smc_cdc_wait_pend_tx_wr(conn); } smc_lgr_unregister_conn(conn); smc_close_active_abort(smc); } static void smc_lgr_cleanup(struct smc_link_group *lgr) { if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); } else { u32 rsn = lgr->llc_termination_rsn; if (!rsn) rsn = SMC_LLC_DEL_PROG_INIT_TERM; smc_llc_send_link_delete_all(lgr, false, rsn); smcr_lgr_link_deactivate_all(lgr); } } /* terminate link group * @soft: true if link group shutdown can take its time * false if immediate link group shutdown is required */ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; if (lgr->terminating) return; /* lgr already terminating */ /* cancel free_work sync, will terminate when lgr->freeing is set */ cancel_delayed_work(&lgr->free_work); lgr->terminating = 1; /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); while (node) { read_unlock_bh(&lgr->conns_lock); conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); sock_hold(&smc->sk); /* sock_put below */ lock_sock(&smc->sk); smc_conn_kill(conn, soft); release_sock(&smc->sk); sock_put(&smc->sk); /* sock_hold above */ read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } read_unlock_bh(&lgr->conns_lock); smc_lgr_cleanup(lgr); smc_lgr_free(lgr); } /* unlink link group and schedule termination */ void smc_lgr_terminate_sched(struct smc_link_group *lgr) { spinlock_t *lgr_lock; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) { spin_unlock_bh(lgr_lock); return; /* lgr already terminating */ } list_del_init(&lgr->list); lgr->freeing = 1; spin_unlock_bh(lgr_lock); schedule_work(&lgr->terminate_work); } /* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, unsigned short vlan) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); /* run common cleanup function and build free list */ spin_lock_bh(&dev->lgr_lock); list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { if ((!peer_gid->gid || (lgr->peer_gid.gid == peer_gid->gid && !smc_ism_is_emulated(dev) ? 1 : lgr->peer_gid.gid_ext == peer_gid->gid_ext)) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { if (peer_gid->gid) /* peer triggered termination */ lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); lgr->freeing = 1; } } spin_unlock_bh(&dev->lgr_lock); /* cancel the regular free workers and actually free lgrs */ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); schedule_work(&lgr->terminate_work); } } /* Called when an SMCD device is removed or the smc module is unloaded */ void smc_smcd_terminate_all(struct smcd_dev *smcd) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); spin_lock_bh(&smcd->lgr_lock); list_splice_init(&smcd->lgr_list, &lgr_free_list); list_for_each_entry(lgr, &lgr_free_list, list) lgr->freeing = 1; spin_unlock_bh(&smcd->lgr_lock); list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); __smc_lgr_terminate(lgr, false); } if (atomic_read(&smcd->lgr_cnt)) wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt)); } /* Called when an SMCR device is removed or the smc module is unloaded. * If smcibdev is given, all SMCR link groups using this device are terminated. * If smcibdev is NULL, all SMCR link groups are terminated. */ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); int i; spin_lock_bh(&smc_lgr_list.lock); if (!smcibdev) { list_splice_init(&smc_lgr_list.list, &lgr_free_list); list_for_each_entry(lgr, &lgr_free_list, list) lgr->freeing = 1; } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].smcibdev == smcibdev) smcr_link_down_cond_sched(&lgr->lnk[i]); } } } spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM); __smc_lgr_terminate(lgr, false); } if (smcibdev) { if (atomic_read(&smcibdev->lnk_cnt)) wait_event(smcibdev->lnks_deleted, !atomic_read(&smcibdev->lnk_cnt)); } else { if (atomic_read(&lgr_cnt)) wait_event(lgrs_deleted, !atomic_read(&lgr_cnt)); } } /* set new lgr type and clear all asymmetric link tagging */ void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) { char *lgr_type = ""; int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) if (smc_link_usable(&lgr->lnk[i])) lgr->lnk[i].link_is_asym = false; if (lgr->type == new_type) return; lgr->type = new_type; switch (lgr->type) { case SMC_LGR_NONE: lgr_type = "NONE"; break; case SMC_LGR_SINGLE: lgr_type = "SINGLE"; break; case SMC_LGR_SYMMETRIC: lgr_type = "SYMMETRIC"; break; case SMC_LGR_ASYMMETRIC_PEER: lgr_type = "ASYMMETRIC_PEER"; break; case SMC_LGR_ASYMMETRIC_LOCAL: lgr_type = "ASYMMETRIC_LOCAL"; break; } pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu state changed: " "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id, lgr->net->net_cookie, lgr_type, lgr->pnet_id); } /* set new lgr type and tag a link as asymmetric */ void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx) { smcr_lgr_set_type(lgr, new_type); lgr->lnk[asym_lnk_idx].link_is_asym = true; } /* abort connection, abort_work scheduled from tasklet context */ static void smc_conn_abort_work(struct work_struct *work) { struct smc_connection *conn = container_of(work, struct smc_connection, abort_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); lock_sock(&smc->sk); smc_conn_kill(conn, true); release_sock(&smc->sk); sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ } void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *n; spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { struct smc_link *link; if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, SMC_MAX_PNETID_LEN) || lgr->type == SMC_LGR_SYMMETRIC || lgr->type == SMC_LGR_ASYMMETRIC_PEER || !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) continue; /* trigger local add link processing */ link = smc_llc_usable_link(lgr); if (link) smc_llc_add_link_local(link); } spin_unlock_bh(&smc_lgr_list.lock); } /* link is down - switch connections to alternate link, * must be called under lgr->llc_conf_mutex lock */ static void smcr_link_down(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_link *to_lnk; int del_link_id; if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) return; to_lnk = smc_switch_conns(lgr, lnk, true); if (!to_lnk) { /* no backup link available */ smcr_link_clear(lnk, true); return; } smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); del_link_id = lnk->link_id; if (lgr->role == SMC_SERV) { /* trigger local delete link processing */ smc_llc_srv_delete_link_local(to_lnk, del_link_id); } else { if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* another llc task is ongoing */ up_write(&lgr->llc_conf_mutex); wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) || lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), SMC_LLC_WAIT_TIME); down_write(&lgr->llc_conf_mutex); } if (!list_empty(&lgr->list)) { smc_llc_send_delete_link(to_lnk, del_link_id, SMC_LLC_REQ, true, SMC_LLC_DEL_LOST_PATH); smcr_link_clear(lnk, true); } wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */ } } /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_down_cond(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); smcr_link_down(lnk); } } /* will get the lgr->llc_conf_mutex lock */ void smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); schedule_work(&lnk->link_down_wrk); } } void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *n; int i; list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, SMC_MAX_PNETID_LEN)) continue; /* lgr is not affected */ if (list_empty(&lgr->list)) continue; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk) && lnk->smcibdev == smcibdev && lnk->ibport == ibport) smcr_link_down_cond_sched(lnk); } } } static void smc_link_down_work(struct work_struct *work) { struct smc_link *link = container_of(work, struct smc_link, link_down_wrk); struct smc_link_group *lgr = link->lgr; if (list_empty(&lgr->list)) return; wake_up_all(&lgr->llc_msg_waiter); down_write(&lgr->llc_conf_mutex); smcr_link_down(link); up_write(&lgr->llc_conf_mutex); } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, struct netdev_nested_priv *priv) { unsigned short *vlan_id = (unsigned short *)priv->data; if (is_vlan_dev(lower_dev)) { *vlan_id = vlan_dev_vlan_id(lower_dev); return 1; } return 0; } /* Determine vlan of internal TCP socket. */ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct netdev_nested_priv priv; struct net_device *ndev; int rc = 0; ini->vlan_id = 0; if (!dst) { rc = -ENOTCONN; goto out; } if (!dst->dev) { rc = -ENODEV; goto out_rel; } ndev = dst->dev; if (is_vlan_dev(ndev)) { ini->vlan_id = vlan_dev_vlan_id(ndev); goto out_rel; } priv.data = (void *)&ini->vlan_id; rtnl_lock(); netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv); rtnl_unlock(); out_rel: dst_release(dst); out: return rc; } static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, u8 peer_systemid[], u8 peer_gid[], u8 peer_mac_v1[], enum smc_lgr_role role, u32 clcqpn, struct net *net) { struct smc_link *lnk; int i; if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) || lgr->role != role) return false; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { lnk = &lgr->lnk[i]; if (!smc_link_active(lnk)) continue; /* use verbs API to check netns, instead of lgr->net */ if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net)) return false; if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) && !memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) && (smcr_version == SMC_V2 || !memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN))) return true; } return false; } static bool smcd_lgr_match(struct smc_link_group *lgr, struct smcd_dev *smcismdev, struct smcd_gid *peer_gid) { if (lgr->peer_gid.gid != peer_gid->gid || lgr->smcd != smcismdev) return false; if (smc_ism_is_emulated(smcismdev) && lgr->peer_gid.gid_ext != peer_gid->gid_ext) return false; return true; } /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; struct net *net = sock_net(&smc->sk); struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; spinlock_t *lgr_lock; int rc = 0; lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list : &smc_lgr_list.list; lgr_lock = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_lock : &smc_lgr_list.lock; ini->first_contact_local = 1; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; if (role == SMC_CLNT && ini->first_contact_peer) /* create new link group as well */ goto create; /* determine if an existing link group can be reused */ spin_lock_bh(lgr_lock); list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected], &ini->ism_peer_gid[ini->ism_selected]) : smcr_lgr_match(lgr, ini->smcr_version, ini->peer_systemid, ini->peer_gid, ini->peer_mac, role, ini->ib_clcqpn, net)) && !lgr->sync_err && (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || (lgr->conns_num < lgr->max_conns && !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); if (!rc && delayed_work_pending(&lgr->free_work)) cancel_delayed_work(&lgr->free_work); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(lgr_lock); if (rc) return rc; if (role == SMC_CLNT && !ini->first_contact_peer && ini->first_contact_local) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. error */ return SMC_CLC_DECL_SYNCERR; } create: if (ini->first_contact_local) { rc = smc_lgr_create(smc, ini); if (rc) goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); if (rc) { smc_lgr_cleanup_early(lgr); goto out; } } smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ if (!conn->lgr->is_smcd) smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; init_waitqueue_head(&conn->cdc_pend_tx_wq); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ } else { conn->rx_off = 0; } #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif out: return rc; } #define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ #define SMCR_RMBE_SIZES 15 /* 0 -> 16KB, 1 -> 32KB, .. 15 -> 512MB */ /* convert the RMB size into the compressed notation (minimum 16K, see * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { u8 compressed; if (size <= SMC_BUF_MIN_SIZE) return 0; size = (size - 1) >> 14; /* convert to 16K multiple */ compressed = min_t(u8, ilog2(size) + 1, is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); #ifdef CONFIG_ARCH_NO_SG_CHAIN if (!is_smcd && is_rmb) /* RMBs are backed by & limited to max size of scatterlists */ compressed = min_t(u8, compressed, ilog2((SG_MAX_SINGLE_ALLOC * PAGE_SIZE) >> 14)); #endif return compressed; } /* convert the RMB size from compressed notation into integer */ int smc_uncompress_bufsize(u8 compressed) { u32 size; size = 0x00000001 << (((int)compressed) + 14); return (int)size; } /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, struct rw_semaphore *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; down_read(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { up_read(lock); return buf_slot; } } up_read(lock); return NULL; } /* one of the conditions for announcing a receiver's current window size is * that it "results in a minimum increase in the window size of 10% of the * receive buffer space" [RFC7609] */ static inline int smc_rmb_wnd_update_limit(int rmbe_size) { return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } /* map an buf to a link */ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { int rc, i, nents, offset, buf_size, size, access_flags; struct scatterlist *sg; void *buf; if (buf_desc->is_map_ib[lnk->link_idx]) return 0; if (buf_desc->is_vm) { buf = buf_desc->cpu_addr; buf_size = buf_desc->len; offset = offset_in_page(buf_desc->cpu_addr); nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE; } else { nents = 1; } rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL); if (rc) return rc; if (buf_desc->is_vm) { /* virtually contiguous buffer */ for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { size = min_t(int, PAGE_SIZE - offset, buf_size); sg_set_page(sg, vmalloc_to_page(buf), size, offset); buf += size / sizeof(*buf); buf_size -= size; offset = 0; } } else { /* physically contiguous buffer */ sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, buf_desc->cpu_addr, buf_desc->len); } /* map sg table to DMA address */ rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != nents) { rc = -EAGAIN; goto free_table; } buf_desc->is_dma_need_sync |= smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx; if (is_rmb || buf_desc->is_vm) { /* create a new memory region for the RMB or vzalloced sndbuf */ access_flags = is_rmb ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_LOCAL_WRITE; rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags, buf_desc, lnk->link_idx); if (rc) goto buf_unmap; smc_ib_sync_sg_for_device(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } buf_desc->is_map_ib[lnk->link_idx] = true; return 0; buf_unmap: smc_ib_buf_unmap_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); free_table: sg_free_table(&buf_desc->sgt[lnk->link_idx]); return rc; } /* register a new buf on IB device, rmb or vzalloced sndbuf * must be called under lgr->llc_conf_mutex lock */ int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) { if (list_empty(&link->lgr->list)) return -ENOLINK; if (!buf_desc->is_reg_mr[link->link_idx]) { /* register memory region for new buf */ if (buf_desc->is_vm) buf_desc->mr[link->link_idx]->iova = (uintptr_t)buf_desc->cpu_addr; if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) { buf_desc->is_reg_err = true; return -EFAULT; } buf_desc->is_reg_mr[link->link_idx] = true; } return 0; } static int _smcr_buf_map_lgr(struct smc_link *lnk, struct rw_semaphore *lock, struct list_head *lst, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf; int rc = 0; down_write(lock); list_for_each_entry_safe(buf_desc, bf, lst, list) { if (!buf_desc->used) continue; rc = smcr_buf_map_link(buf_desc, is_rmb, lnk); if (rc) goto out; } out: up_write(lock); return rc; } /* map all used buffers of lgr for a new link */ int smcr_buf_map_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; int i, rc = 0; for (i = 0; i < SMC_RMBE_SIZES; i++) { rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock, &lgr->rmbs[i], true); if (rc) return rc; rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock, &lgr->sndbufs[i], false); if (rc) return rc; } return 0; } /* register all used buffers of lgr for a new link, * must be called under lgr->llc_conf_mutex lock */ int smcr_buf_reg_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_buf_desc *buf_desc, *bf; int i, rc = 0; /* reg all RMBs for a new link */ down_write(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { up_write(&lgr->rmbs_lock); return rc; } } } up_write(&lgr->rmbs_lock); if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) return rc; /* reg all vzalloced sndbufs for a new link */ down_write(&lgr->sndbufs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { if (!buf_desc->used || !buf_desc->is_vm) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { up_write(&lgr->sndbufs_lock); return rc; } } } up_write(&lgr->sndbufs_lock); return rc; } static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return ERR_PTR(-ENOMEM); switch (lgr->buf_type) { case SMCR_PHYS_CONT_BUFS: case SMCR_MIXED_BUFS: buf_desc->order = get_order(bufsize); buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | __GFP_COMP | __GFP_NORETRY | __GFP_ZERO, buf_desc->order); if (buf_desc->pages) { buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); buf_desc->len = bufsize; buf_desc->is_vm = false; break; } if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) goto out; fallthrough; // try virtually continguous buf case SMCR_VIRT_CONT_BUFS: buf_desc->order = get_order(bufsize); buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); if (!buf_desc->cpu_addr) goto out; buf_desc->pages = NULL; buf_desc->len = bufsize; buf_desc->is_vm = true; break; } return buf_desc; out: kfree(buf_desc); return ERR_PTR(-EAGAIN); } /* map buf_desc on all usable links, * unused buffers stay mapped as long as the link is up */ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, struct smc_buf_desc *buf_desc, bool is_rmb) { int i, rc = 0, cnt = 0; /* protect against parallel link reconfiguration */ down_read(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; if (!smc_link_usable(lnk)) continue; if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { rc = -ENOMEM; goto out; } cnt++; } out: up_read(&lgr->llc_conf_mutex); if (!rc && !cnt) rc = -EINVAL; return rc; } static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) { struct smc_buf_desc *buf_desc; int rc; /* try to alloc a new DMB */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return ERR_PTR(-ENOMEM); if (is_dmb) { rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); if (rc == -ENOMEM) return ERR_PTR(-EAGAIN); if (rc == -ENOSPC) return ERR_PTR(-ENOSPC); return ERR_PTR(-EIO); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. So, pretend it was smaller */ buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg); } else { buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC); if (!buf_desc->cpu_addr) { kfree(buf_desc); return ERR_PTR(-EAGAIN); } buf_desc->len = bufsize; } return buf_desc; } static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) { struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_comp; struct rw_semaphore *lock; /* lock buffer list */ bool is_dgraded = false; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ bufsize = smc->sk.sk_rcvbuf / 2; else /* use socket send buffer size (w/o overhead) as start value */ bufsize = smc->sk.sk_sndbuf / 2; for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb); bufsize_comp >= 0; bufsize_comp--) { if (is_rmb) { lock = &lgr->rmbs_lock; buf_list = &lgr->rmbs[bufsize_comp]; } else { lock = &lgr->sndbufs_lock; buf_list = &lgr->sndbufs[bufsize_comp]; } bufsize = smc_uncompress_bufsize(bufsize_comp); /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list); if (buf_desc) { buf_desc->is_dma_need_sync = 0; SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); break; /* found reusable slot */ } if (is_smcd) buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); else buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); if (PTR_ERR(buf_desc) == -ENOMEM) break; if (IS_ERR(buf_desc)) { if (!is_dgraded) { is_dgraded = true; SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb); } continue; } SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); buf_desc->used = 1; down_write(lock); list_add(&buf_desc->list, buf_list); up_write(lock); break; /* found */ } if (IS_ERR(buf_desc)) return PTR_ERR(buf_desc); if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { smcr_buf_unuse(buf_desc, is_rmb, lgr); return -ENOMEM; } } if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_comp = bufsize_comp; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); if (is_smcd) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } return 0; } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { if (!conn->sndbuf_desc->is_dma_need_sync) return; if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { int i; if (!conn->rmb_desc->is_dma_need_sync) return; if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&conn->lgr->lnk[i])) continue; smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, DMA_FROM_DEVICE); } } /* create the send and receive buffer for an SMC socket; * receive buffers are called RMBs; * (even though the SMC protocol allows more than one RMB-element per RMB, * the Linux implementation uses just one RMB-element per RMB, i.e. uses an * extra RMB for every connection in a link group */ int smc_buf_create(struct smc_sock *smc, bool is_smcd) { int rc; /* create send buffer */ if (is_smcd && smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) goto create_rmb; rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; create_rmb: /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); if (rc && smc->conn.sndbuf_desc) { down_write(&smc->conn.lgr->sndbufs_lock); list_del(&smc->conn.sndbuf_desc->list); up_write(&smc->conn.lgr->sndbufs_lock); smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); smc->conn.sndbuf_desc = NULL; } return rc; } int smcd_buf_attach(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; struct smcd_dev *smcd = conn->lgr->smcd; u64 peer_token = conn->peer_token; struct smc_buf_desc *buf_desc; int rc; buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return -ENOMEM; /* The ghost sndbuf_desc describes the same memory region as * peer RMB. Its lifecycle is consistent with the connection's * and it will be freed with the connections instead of the * link group. */ rc = smc_ism_attach_dmb(smcd, peer_token, buf_desc); if (rc) goto free; smc->sk.sk_sndbuf = buf_desc->len; buf_desc->cpu_addr = (u8 *)buf_desc->cpu_addr + sizeof(struct smcd_cdc_msg); buf_desc->len -= sizeof(struct smcd_cdc_msg); conn->sndbuf_desc = buf_desc; conn->sndbuf_desc->used = 1; atomic_set(&conn->sndbuf_space, conn->sndbuf_desc->len); return 0; free: kfree(buf_desc); return rc; } static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) { int i; for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) { if (!test_and_set_bit(i, lgr->rtokens_used_mask)) return i; } return -ENOSPC; } static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx, u32 rkey) { int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (test_bit(i, lgr->rtokens_used_mask) && lgr->rtokens[i][lnk_idx].rkey == rkey) return i; } return -ENOENT; } /* set rtoken for a new link to an existing rmb */ void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey) { int rtok_idx; rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known)); if (rtok_idx == -ENOENT) return; lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey); lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr); } /* set rtoken for a new link whose link_id is given */ void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, __be64 nw_vaddr, __be32 nw_rkey) { u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); bool found = false; int link_idx; for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) { if (lgr->lnk[link_idx].link_id == link_id) { found = true; break; } } if (!found) return; lgr->rtokens[rtok_idx][link_idx].rkey = rkey; lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr; } /* add a new rtoken from peer */ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { struct smc_link_group *lgr = smc_get_lgr(lnk); u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; } } i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; lgr->rtokens[i][lnk->link_idx].rkey = rkey; lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; return i; } /* delete an rtoken from all links */ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { lgr->rtokens[i][j].rkey = 0; lgr->rtokens[i][j].dma_addr = 0; } clear_bit(i, lgr->rtokens_used_mask); return 0; } } return -ENOENT; } /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr, clc->r0.rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; return 0; } static void smc_core_going_away(void) { struct smc_ib_device *smcibdev; struct smcd_dev *smcd; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { int i; for (i = 0; i < SMC_MAX_PORTS; i++) set_bit(i, smcibdev->ports_going_away); } mutex_unlock(&smc_ib_devices.mutex); mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { smcd->going_away = 1; } mutex_unlock(&smcd_dev_list.mutex); } /* Clean up all SMC link groups */ static void smc_lgrs_shutdown(void) { struct smcd_dev *smcd; smc_core_going_away(); smc_smcr_terminate_all(NULL); mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) smc_smcd_terminate_all(smcd); mutex_unlock(&smcd_dev_list.mutex); } static int smc_core_reboot_event(struct notifier_block *this, unsigned long event, void *ptr) { smc_lgrs_shutdown(); smc_ib_unregister_client(); smc_ism_exit(); return 0; } static struct notifier_block smc_reboot_notifier = { .notifier_call = smc_core_reboot_event, }; int __init smc_core_init(void) { return register_reboot_notifier(&smc_reboot_notifier); } /* Called (from smc_exit) when module is removed */ void smc_core_exit(void) { unregister_reboot_notifier(&smc_reboot_notifier); smc_lgrs_shutdown(); } |
635 395 235 360 395 452 4 1 338 338 33 33 33 2 33 250 318 327 213 532 25 17 532 122 9 119 8 349 330 529 16 525 128 5 5 334 6 328 6 515 12 11 11 4 5 8 12 765 42 725 92 460 784 784 764 23 528 458 186 330 474 776 12 647 339 780 218 744 13 121 13 121 784 779 297 42 123 65 55 3 339 328 339 326 338 329 283 121 338 266 73 338 122 121 2 121 2 123 123 123 123 577 562 17 95 485 478 75 28 543 3 578 69 4 61 11 1847 1805 330 69 321 135 326 108 350 147 297 587 586 586 582 61 61 61 603 576 35 29 604 663 467 617 616 662 338 523 141 657 10 663 661 633 84 24 529 527 338 642 23 662 663 334 532 596 358 663 663 661 659 655 46 488 603 69 69 185 47 139 38 38 38 1 37 37 1 38 38 127 188 4 65 65 65 65 65 65 65 64 65 31 61 65 45 45 45 14 5 45 45 45 45 6 43 247 343 370 256 405 20 416 460 16 453 446 16 460 2 11 393 63 394 137 325 4 350 12 132 272 15 57 15 234 293 28 11 171 25 413 135 84 86 11 11 10 136 152 130 45 21 186 186 185 185 186 186 38 182 91 44 46 37 8 4 112 113 113 428 396 33 12 26 41 396 2 400 243 672 328 235 371 702 650 542 644 73 338 171 3 57 113 432 97 428 432 318 423 11 423 421 119 188 387 394 344 47 194 72 135 359 186 397 397 394 139 377 198 395 384 59 24 394 249 489 323 369 11 217 255 119 353 358 98 71 3 69 24 52 15 21 34 361 399 97 87 87 532 434 523 12 529 509 155 155 480 30 38 127 3 6 6 6 6 21 61 61 61 61 61 61 11 6 108 108 108 95 6 6 98 91 39 87 95 17 93 64 61 98 97 98 97 98 14 37 66 43 61 43 66 77 77 65 77 77 66 46 31 15 76 76 12 60 34 65 75 63 597 225 548 552 84 85 17 69 85 41 41 40 41 133 6 127 127 133 123 123 123 119 4 7 118 123 123 123 123 122 361 362 19 362 362 361 4 357 362 354 1 4 362 4 362 362 334 30 362 333 144 121 23 23 1 4 17 17 4 14 18 1 17 17 1 127 123 14 363 364 2 362 30 333 334 144 190 332 1 21 207 186 180 80 2 78 187 187 20 68 433 5 432 274 28 28 4 28 8 8 1 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 | // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ /* * Changes: Pedro Roque : Retransmit queue handled by TCP. * : Fragmentation on mtu decrease * : Segment collapse on retransmit * : AF independence * * Linus Torvalds : send_delayed_ack * David S. Miller : Charge memory using the right skb * during syn/ack processing. * David S. Miller : Output engine completely rewritten. * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. * Cacophonix Gaul : draft-minshall-nagle-01 * J Hadi Salim : ECN support * */ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h> #include <net/mptcp.h> #include <net/proto_memory.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> #include <linux/static_key.h> #include <linux/skbuff_ref.h> #include <trace/events/tcp.h> /* Refresh clocks of a TCP socket, * ensuring monotically increasing values. */ void tcp_mstamp_refresh(struct tcp_sock *tp) { u64 val = tcp_clock_ns(); tp->tcp_clock_cache = val; tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC); } static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); if (tp->highest_sack == NULL) tp->highest_sack = skb; tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); tcp_check_space(sk); } /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one * window scaling factor due to loss of precision. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */ static inline __u32 tcp_acceptable_seq(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (!before(tcp_wnd_end(tp), tp->snd_nxt) || (tp->rx_opt.wscale_ok && ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale)))) return tp->snd_nxt; else return tcp_wnd_end(tp); } /* Calculate mss to advertise in SYN segment. * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: * * 1. It is independent of path mtu. * 2. Ideally, it is maximal possible segment size i.e. 65535-40. * 3. For IPv4 it is reasonable to calculate it from maximal MTU of * attached devices, because some buggy hosts are confused by * large MSS. * 4. We do not make 3, we advertise MSS, calculated from first * hop device mtu, but allow to raise it to ip_rt_min_advmss. * This may be overridden via information stored in routing table. * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, * probably even Jumbo". */ static __u16 tcp_advertise_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; if (dst) { unsigned int metric = dst_metric_advmss(dst); if (metric < mss) { mss = metric; tp->advmss = mss; } } return (__u16)mss; } /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 cwnd = tcp_snd_cwnd(tp); tcp_ca_event(sk, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(sk); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd)); tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } /* Congestion state accounting after a packet has been sent. */ static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_jiffies32; if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, increase pingpong count. */ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) inet_csk_inc_pingpong_cnt(sk); } /* Account for an ACK we sent. */ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, tp->compressed_ack); tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } if (unlikely(rcv_nxt != tp->rcv_nxt)) return; /* Special ACK sent by DCTCP to reflect ECN */ tcp_dec_quickack_mode(sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. * NOTE: for smooth operation initial space offering should * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *__window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); u32 window_clamp = READ_ONCE(*__window_clamp); /* If no clamp set the clamp to the max possible scaled window */ if (window_clamp == 0) window_clamp = (U16_MAX << TCP_MAX_WSCALE); space = min(window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) space = rounddown(space, mss); /* NOTE: offering an initial window larger than 32767 * will break some buggy TCP stacks. If the admin tells us * it is likely we could be speaking with such a buggy stack * we will truncate our initial window offering to 32K-1 * unless the remote has sent us a window scaling option, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); space = min_t(u32, space, window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ WRITE_ONCE(*__window_clamp, min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } EXPORT_SYMBOL(tcp_select_initial_window); /* Chose a new window to advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return * value can be stuffed directly into th->window for an outgoing * frame. */ static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 old_win = tp->rcv_wnd; u32 cur_win, new_win; /* Make the window 0 if we failed to queue the data because we * are out of memory. The window is temporary, so we don't store * it on the socket. */ if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) return 0; cur_win = tcp_receive_window(tp); new_win = __tcp_select_window(sk); if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else * we will not be able to advertise a zero * window in time. --DaveM * * Relax Will Robinson. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) { /* Never shrink the offered window */ if (new_win == 0) NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; /* Make sure we do not exceed the maximum possible * scaled window. */ if (!tp->rx_opt.rcv_wscale && READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ if (new_win == 0) { tp->pred_flags = 0; if (old_win) NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV); } else if (old_win == 0) { NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV); } return new_win; } /* Packet ECN state for a SYN-ACK */ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) INET_ECN_xmit(sk); } /* Packet ECN state for a SYN. */ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || tcp_ca_needs_ecn(sk) || bpf_needs_ecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) use_ecn = true; } tp->ecn_flags = 0; if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); } } static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. */ TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); } static void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { if (inet_rsk(req)->ecn_ok) th->ece = 1; } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); if (tp->ecn_flags & TCP_ECN_OK) { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { INET_ECN_xmit(sk); if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) th->ece = 1; } } /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) seq++; TCP_SKB_CB(skb)->end_seq = seq; } static inline bool tcp_urg_mode(const struct tcp_sock *tp) { return tp->snd_una != tp->snd_up; } #define OPTION_SACK_ADVERTISE BIT(0) #define OPTION_TS BIT(1) #define OPTION_MD5 BIT(2) #define OPTION_WSCALE BIT(3) #define OPTION_FAST_OPEN_COOKIE BIT(8) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) static void smc_options_write(__be32 *ptr, u16 *options) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (unlikely(OPTION_SMC & *options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_EXP << 8) | (TCPOLEN_EXP_SMC_BASE)); *ptr++ = htonl(TCPOPT_SMC_MAGIC); } } #endif } struct tcp_out_options { u16 options; /* bit field of OPTION_* */ u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ struct mptcp_out_options mptcp; }; static void mptcp_options_write(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) mptcp_write_options(th, ptr, tp, &opts->mptcp); #endif } #ifdef CONFIG_CGROUP_BPF static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, enum tcp_synack_type synack_type) { if (unlikely(!skb)) return BPF_WRITE_HDR_TCP_CURRENT_MSS; if (unlikely(synack_type == TCP_SYNACK_COOKIE)) return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; return 0; } /* req, syn_skb and synack_type are used when writing synack */ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts, unsigned int *remaining) { struct bpf_sock_ops_kern sock_ops; int err; if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || !*remaining) return; /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ /* init sock_ops */ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; if (req) { /* The listen "sk" cannot be passed here because * it is not locked. It would not make too much * sense to do bpf_setsockopt(listen_sk) based * on individual connection request also. * * Thus, "req" is passed here and the cgroup-bpf-progs * of the listen "sk" will be run. * * "req" is also used here for fastopen even the "sk" here is * a fullsock "child" sk. It is to keep the behavior * consistent between fastopen and non-fastopen on * the bpf programming side. */ sock_ops.sk = (struct sock *)req; sock_ops.syn_skb = syn_skb; } else { sock_owned_by_me(sk); sock_ops.is_fullsock = 1; sock_ops.sk = sk; } sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); sock_ops.remaining_opt_len = *remaining; /* tcp_current_mss() does not pass a skb */ if (skb) bpf_skops_init_skb(&sock_ops, skb, 0); err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); if (err || sock_ops.remaining_opt_len == *remaining) return; opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; /* round up to 4 bytes */ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; *remaining -= opts->bpf_opt_len; } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; struct bpf_sock_ops_kern sock_ops; int err; if (likely(!max_opt_len)) return; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; if (req) { sock_ops.sk = (struct sock *)req; sock_ops.syn_skb = syn_skb; } else { sock_owned_by_me(sk); sock_ops.is_fullsock = 1; sock_ops.sk = sk; } sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); sock_ops.remaining_opt_len = max_opt_len; first_opt_off = tcp_hdrlen(skb) - max_opt_len; bpf_skops_init_skb(&sock_ops, skb, first_opt_off); err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); if (err) nr_written = 0; else nr_written = max_opt_len - sock_ops.remaining_opt_len; if (nr_written < max_opt_len) memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, max_opt_len - nr_written); } #else static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts, unsigned int *remaining) { } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { } #endif static __be32 *process_tcp_ao_options(struct tcp_sock *tp, const struct tcp_request_sock *tcprsk, struct tcp_out_options *opts, struct tcp_key *key, __be32 *ptr) { #ifdef CONFIG_TCP_AO u8 maclen = tcp_ao_maclen(key->ao_key); if (tcprsk) { u8 aolen = maclen + sizeof(struct tcp_ao_hdr); *ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) | (tcprsk->ao_keyid << 8) | (tcprsk->ao_rcv_next)); } else { struct tcp_ao_key *rnext_key; struct tcp_ao_info *ao_info; ao_info = rcu_dereference_check(tp->ao_info, lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk)); rnext_key = READ_ONCE(ao_info->rnext_key); if (WARN_ON_ONCE(!rnext_key)) return ptr; *ptr++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (rnext_key->rcvid)); } opts->hash_location = (__u8 *)ptr; ptr += maclen / sizeof(*ptr); if (unlikely(maclen % sizeof(*ptr))) { memset(ptr, TCPOPT_NOP, sizeof(*ptr)); ptr++; } #endif return ptr; } /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from * inter-operability perspective it seems that we're somewhat stuck with * the ordering which we have been using if we want to keep working with * those broken things (not that it currently hurts anybody as there isn't * particular reason why the ordering would need to be changed). * * At least SACK_PERM as the first option is known to lead to a disaster * (but it may well be that other scenarios fail similarly). */ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, const struct tcp_request_sock *tcprsk, struct tcp_out_options *opts, struct tcp_key *key) { __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ if (tcp_key_is_md5(key)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* overload cookie hash location */ opts->hash_location = (__u8 *)ptr; ptr += 4; } else if (tcp_key_is_ao(key)) { ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr); } if (unlikely(opts->mss)) { *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss); } if (likely(OPTION_TS & options)) { if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); options &= ~OPTION_SACK_ADVERTISE; } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); } *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); } if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); } if (unlikely(OPTION_WSCALE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); } if (unlikely(opts->num_sack_blocks)) { struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; int this_sack; *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); } tp->rx_opt.dsack = 0; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; u8 *p = (u8 *)ptr; u32 len; /* Fast Open option length */ if (foc->exp) { len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | TCPOPT_FASTOPEN_MAGIC); p += TCPOLEN_EXP_FASTOPEN_BASE; } else { len = TCPOLEN_FASTOPEN_BASE + foc->len; *p++ = TCPOPT_FASTOPEN; *p++ = len; } memcpy(p, foc->val, foc->len); if ((len & 3) == 2) { p[foc->len] = TCPOPT_NOP; p[foc->len + 1] = TCPOPT_NOP; } ptr += (len + 3) >> 2; } smc_options_write(ptr, &options); mptcp_options_write(th, ptr, tp, opts); } static void smc_set_option(const struct tcp_sock *tp, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } } #endif } static void smc_set_option_cond(const struct tcp_sock *tp, const struct inet_request_sock *ireq, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && ireq->smc_ok) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } } #endif } static void mptcp_set_option_cond(const struct request_sock *req, struct tcp_out_options *opts, unsigned int *remaining) { if (rsk_is_mptcp(req)) { unsigned int size; if (mptcp_synack_options(req, &size, &opts->mptcp)) { if (*remaining >= size) { opts->options |= OPTION_MPTCP; *remaining -= size; } } } } /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_key *key) { struct tcp_sock *tp = tcp_sk(sk); unsigned int remaining = MAX_TCP_OPTION_SPACE; struct tcp_fastopen_request *fastopen = tp->fastopen_req; bool timestamps; /* Better than switch (key.type) as it has static branches */ if (tcp_key_is_md5(key)) { timestamps = false; opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; } else { timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps); if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; remaining -= tcp_ao_len_aligned(key->ao_key); } } /* We always get an MSS option. The option bytes which will be seen in * normal data packets should timestamps be used, must be in the MSS * advertised. But we subtract them from tp->mss_cache so that * calculations in tcp_sendmsg are simpler etc. So account for this * fact here if necessary. If we don't do this correctly, as a * receiver we won't recognize data packets as being full sized when we * should, and thus we won't abide by the delayed ACK rules correctly. * SACKs don't matter, we never delay an ACK when we have any of those * going out. */ opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; if (likely(timestamps)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (fastopen && fastopen->cookie.len >= 0) { u32 need = fastopen->cookie.len; need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE : TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = &fastopen->cookie; remaining -= need; tp->syn_fastopen = 1; tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0; } } smc_set_option(tp, opts, &remaining); if (sk_is_mptcp(sk)) { unsigned int size; if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { opts->options |= OPTION_MPTCP; remaining -= size; } } bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ static unsigned int tcp_synack_options(const struct sock *sk, struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_key *key, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; /* We can't fit any SACK blocks in a packet with MD5 + TS * options. There was discussion about disabling SACK * rather than TS in order to fit in better with old, * buggy kernels, but that was deemed to be unnecessary. */ if (synack_type != TCP_SYNACK_COOKIE) ireq->tstamp_ok &= !ireq->sack_ok; } else if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; remaining -= tcp_ao_len_aligned(key->ao_key); ireq->tstamp_ok &= !ireq->sack_ok; } /* We always send an MSS option. */ opts->mss = mss; remaining -= TCPOLEN_MSS_ALIGNED; if (likely(ireq->wscale_ok)) { opts->ws = ireq->rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + tcp_rsk(req)->ts_off; opts->tsecr = READ_ONCE(req->ts_recent); remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (foc != NULL && foc->len >= 0) { u32 need = foc->len; need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE : TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = foc; remaining -= need; } } mptcp_set_option_cond(req, opts, &remaining); smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; } /* Compute TCP options for ESTABLISHED sockets. This is not the * final wire format yet. */ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_key *key) { struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks; opts->options = 0; /* Better than switch (key.type) as it has static branches */ if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; size += TCPOLEN_MD5SIG_ALIGNED; } else if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; size += tcp_ao_len_aligned(key->ao_key); } if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } /* MPTCP options have precedence over SACK for the limited TCP * option space because a MPTCP connection would be forced to * fall back to regular TCP if a required multipath option is * missing. SACK still gets a chance to use whatever space is * left. */ if (sk_is_mptcp(sk)) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; unsigned int opt_size = 0; if (mptcp_established_options(sk, skb, &opt_size, remaining, &opts->mptcp)) { opts->options |= OPTION_MPTCP; size += opt_size; } } eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK)) return size; opts->num_sack_blocks = min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); size = MAX_TCP_OPTION_SPACE - remaining; } return size; } /* TCP SMALL QUEUES (TSQ) * * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) * to reduce RTT and bufferbloat. * We do this using a special skb destructor (tcp_wfree). * * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb * needs to be reallocated in a driver. * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * * Since transmit from skb destructor is forbidden, we use a tasklet * to process all sockets that eventually need to send more skbs. * We use one tasklet per cpu, with its own queue of sockets. */ struct tsq_tasklet { struct tasklet_struct tasklet; struct list_head head; /* queue of tcp sockets */ }; static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); static void tcp_tsq_write(struct sock *sk) { if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) { struct tcp_sock *tp = tcp_sk(sk); if (tp->lost_out > tp->retrans_out && tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) { tcp_mstamp_refresh(tp); tcp_xmit_retransmit_queue(sk); } tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); } } static void tcp_tsq_handler(struct sock *sk) { bh_lock_sock(sk); if (!sock_owned_by_user(sk)) tcp_tsq_write(sk); else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); bh_unlock_sock(sk); } /* * One tasklet per cpu tries to send more skbs. * We run in tasklet context but need to disable irqs when * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ static void tcp_tasklet_func(struct tasklet_struct *t) { struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet); LIST_HEAD(list); unsigned long flags; struct list_head *q, *n; struct tcp_sock *tp; struct sock *sk; local_irq_save(flags); list_splice_init(&tsq->head, &list); local_irq_restore(flags); list_for_each_safe(q, n, &list) { tp = list_entry(q, struct tcp_sock, tsq_node); list_del(&tp->tsq_node); sk = (struct sock *)tp; smp_mb__before_atomic(); clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); tcp_tsq_handler(sk); sk_free(sk); } } #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ TCPF_WRITE_TIMER_DEFERRED | \ TCPF_DELACK_TIMER_DEFERRED | \ TCPF_MTU_REDUCED_DEFERRED | \ TCPF_ACK_DEFERRED) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket * * called from release_sock() to perform protocol dependent * actions before socket release. */ void tcp_release_cb(struct sock *sk) { unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags); unsigned long nflags; /* perform an atomic operation only if at least one flag is set */ do { if (!(flags & TCP_DEFERRED_ALL)) return; nflags = flags & ~TCP_DEFERRED_ALL; } while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags)); if (flags & TCPF_TSQ_DEFERRED) { tcp_tsq_write(sk); __sock_put(sk); } if (flags & TCPF_WRITE_TIMER_DEFERRED) { tcp_write_timer_handler(sk); __sock_put(sk); } if (flags & TCPF_DELACK_TIMER_DEFERRED) { tcp_delack_timer_handler(sk); __sock_put(sk); } if (flags & TCPF_MTU_REDUCED_DEFERRED) { inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk)) tcp_send_ack(sk); } EXPORT_SYMBOL(tcp_release_cb); void __init tcp_tasklet_init(void) { int i; for_each_possible_cpu(i) { struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); INIT_LIST_HEAD(&tsq->head); tasklet_setup(&tsq->tasklet, tcp_tasklet_func); } } /* * Write buffer destructor automatically called from kfree_skb. * We can't xmit new skbs from this context, as we might already * hold qdisc lock. */ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nval, oval; struct tsq_tasklet *tsq; bool empty; /* Keep one reference on sk_wmem_alloc. * Will be released by sk_free() from here or tcp_tasklet_func() */ WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc)); /* If this softirq is serviced by ksoftirqd, we are likely under stress. * Wait until our queues (qdisc + devices) are drained. * This gives : * - less callbacks to tcp_write_xmit(), reducing stress (batches) * - chance for incoming ACK (processed by another cpu maybe) * to migrate this flow (skb->ooo_okay will be eventually set) */ if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) goto out; oval = smp_load_acquire(&sk->sk_tsq_flags); do { if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) goto out; nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval)); /* queue this socket to tasklet queue */ local_irq_save(flags); tsq = this_cpu_ptr(&tsq_tasklet); empty = list_empty(&tsq->head); list_add(&tp->tsq_node, &tsq->head); if (empty) tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); return; out: sk_free(sk); } /* Note: Called under soft irq. * We can call TCP stack right away, unless socket is owned by user. */ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) { struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer); struct sock *sk = (struct sock *)tp; tcp_tsq_handler(sk); sock_put(sk); return HRTIMER_NORESTART; } static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, u64 prior_wstamp) { struct tcp_sock *tp = tcp_sk(sk); if (sk->sk_pacing_status != SK_PACING_NONE) { unsigned long rate = READ_ONCE(sk->sk_pacing_rate); /* Original sch_fq does not pace first 10 MSS * Note that tp->data_segs_out overflows after 2^32 packets, * this is a minor annoyance. */ if (rate != ~0UL && rate && tp->data_segs_out >= 10) { u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate); u64 credit = tp->tcp_wstamp_ns - prior_wstamp; /* take into account OS jitter */ len_ns -= min_t(u64, len_ns / 2, credit); tp->tcp_wstamp_ns += len_ns; } } list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. * All SKB's seen here are completely headerless. It is our * job to build the TCP header, and pass the packet down to * IP so it can do the same plus pass the packet off to the * device. * * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned int tcp_options_size, tcp_header_size; struct sk_buff *oskb = NULL; struct tcp_key key; struct tcphdr *th; u64 prior_wstamp; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); if (clone_it) { oskb = skb; tcp_skb_tsorted_save(oskb) { if (unlikely(skb_cloned(oskb))) skb = pskb_copy(oskb, gfp_mask); else skb = skb_clone(oskb, gfp_mask); } tcp_skb_tsorted_restore(oskb); if (unlikely(!skb)) return -ENOBUFS; /* retransmit skbs might have a non zero value in skb->dev * because skb->dev is aliased with skb->rbnode.rb_left */ skb->dev = NULL; } inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); tcp_get_current_key(sk, &key); if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { tcp_options_size = tcp_syn_options(sk, skb, &opts, &key); } else { tcp_options_size = tcp_established_options(sk, skb, &opts, &key); /* Force a PSH flag on all (GSO) packets to expedite GRO flush * at receiver : This slightly improve GRO performance. * Note that we do not force the PSH flag for non GSO packets, * because they might be sent under high congestion events, * and in this case it is better to delay the delivery of 1-MSS * packets and thus the corresponding ACK packet that would * release the following packet. */ if (tcp_skb_pcount(skb) > 1) tcb->tcp_flags |= TCPHDR_PSH; } tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* We set skb->ooo_okay to one if this packet can select * a different TX queue than prior packets of this flow, * to avoid self inflicted reorders. * The 'other' queue decision is based on current cpu number * if XPS is enabled, or sk->sk_txhash otherwise. * We can switch to another (and better) queue if: * 1) No packet with payload is in qdisc/device queues. * Delays in TX completion can defeat the test * even if packets were already sent. * 2) Or rtx queue is empty. * This mitigates above case if ACK packets for * all prior packets were already processed. */ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) || tcp_rtx_queue_empty(sk); /* If we had to use memory reserve to allocate this skb, * this might cause drops if packet is looped back : * Other socket might not have SOCK_MEMALLOC. * Packets not looped back do not care about pfmemalloc. */ skb->pfmemalloc = 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm)); /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); th->check = 0; th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { if (before(tp->snd_up, tcb->seq + 0x10000)) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { th->urg_ptr = htons(0xFFFF); th->urg = 1; } } skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); tcp_ecn_send(sk, skb, th, tcp_header_size); } else { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. */ th->window = htons(min(tp->rcv_wnd, 65535U)); } tcp_options_write(th, tp, NULL, &opts, &key); if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ sk_gso_disable(sk); tp->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, sk, skb); #endif } else if (tcp_key_is_ao(&key)) { int err; err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th, opts.hash_location); if (err) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); return -ENOMEM; } } /* BPF prog is the last one writing header option */ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); tp->bytes_sent += skb->len - tcp_header_size; } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); skb_set_hash_from_sk(skb, sk); /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ /* Cleanup our debris for IP stacks */ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); tcp_add_tx_delay(skb, tp); err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, inet6_csk_xmit, ip_queue_xmit, sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); err = net_xmit_eval(err); } if (!err && oskb) { tcp_update_skb_after_send(sk, oskb, prior_wstamp); tcp_rate_skb_sent(sk, oskb); } return err; } static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tcp_sk(sk)->rcv_nxt); } /* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. */ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } /* Initialize TSO segments for a packet. */ static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs; if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ TCP_SKB_CB(skb)->tcp_gso_size = 0; tcp_skb_pcount_set(skb, 1); return 1; } TCP_SKB_CB(skb)->tcp_gso_size = mss_now; tso_segs = DIV_ROUND_UP(skb->len, mss_now); tcp_skb_pcount_set(skb, tso_segs); return tso_segs; } /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); tp->packets_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) tp->lost_out -= decr; /* Reno case is special. Sigh... */ if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); } static bool tcp_has_tx_tstamp(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->txstamp_ack || (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); } static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (unlikely(tcp_has_tx_tstamp(skb)) && !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) { struct skb_shared_info *shinfo2 = skb_shinfo(skb2); u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tx_flags &= ~tsflags; shinfo2->tx_flags |= tsflags; swap(shinfo->tskey, shinfo2->tskey); TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack; TCP_SKB_CB(skb)->txstamp_ack = 0; } } static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) { TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor; TCP_SKB_CB(skb)->eor = 0; } /* Insert buff after skb on the write or rtx queue of sk. */ static void tcp_insert_write_queue_after(struct sk_buff *skb, struct sk_buff *buff, struct sock *sk, enum tcp_queue tcp_queue) { if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) __skb_queue_after(&sk->sk_write_queue, skb, buff); else tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); } /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int old_factor; long limit; int nlen; u8 flags; if (WARN_ON(len > skb->len)) return -EINVAL; DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb. * We need some allowance to not penalize applications setting small * SO_SNDBUF values. * Also allow first and last skb in retransmit queue to be split. */ limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE); if (unlikely((sk->sk_wmem_queued >> 1) > limit && tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && skb != tcp_rtx_queue_head(sk) && skb != tcp_rtx_queue_tail(sk))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); return -ENOMEM; } if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ buff = tcp_stream_alloc_skb(sk, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); nlen = skb->len - len; buff->truesize += nlen; skb->truesize -= nlen; /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; tcp_skb_fragment_eor(skb, buff); skb_split(skb, buff, len); skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); /* Update delivered info for the new segment */ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx; /* If this packet has been sent out already, we must * adjust the various packet counters. */ if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { int diff = old_factor - tcp_skb_pcount(skb) - tcp_skb_pcount(buff); if (diff) tcp_adjust_pcount(sk, skb, diff); } /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } /* This is similar to __pskb_pull_tail(). The difference is that pulled * data is not copied, but immediately discarded. */ static int __pskb_trim_head(struct sk_buff *skb, int len) { struct skb_shared_info *shinfo; int i, k, eat; DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); eat = len; k = 0; shinfo = skb_shinfo(skb); for (i = 0; i < shinfo->nr_frags; i++) { int size = skb_frag_size(&shinfo->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { shinfo->frags[k] = shinfo->frags[i]; if (eat) { skb_frag_off_add(&shinfo->frags[k], eat); skb_frag_size_sub(&shinfo->frags[k], eat); eat = 0; } k++; } } shinfo->nr_frags = k; skb->data_len -= len; skb->len = skb->data_len; return len; } /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); if (!skb_zcopy_pure(skb)) sk_mem_uncharge(sk, delta_truesize); /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); return 0; } /* Calculate MSS not accounting any TCP options. */ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ mss_now = max(mss_now, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss)); return mss_now; } /* Calculate MSS. Not accounting for SACKs here. */ int tcp_mtu_to_mss(struct sock *sk, int pmtu) { /* Subtract TCP options size, not including SACKs */ return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } EXPORT_SYMBOL(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); return mss + tp->tcp_header_len + icsk->icsk_ext_hdr_len + icsk->icsk_af_ops->net_header_len; } EXPORT_SYMBOL(tcp_mss_to_mtu); /* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss)); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } EXPORT_SYMBOL(tcp_mtup_init); /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. It is minimum of user_mss and mss received with SYN. It also does not include TCP options. inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. tp->mss_cache is current effective sending mss, including all tcp options except for SACKs. It is evaluated, taking into account current pmtu, but never exceeds tp->rx_opt.mss_clamp. NOTE1. rfc1122 clearly states that advertised MSS DOES NOT include either tcp or ip options. NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache are READ ONLY outside this function. --ANK (980731) */ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; if (icsk->icsk_mtup.search_high > pmtu) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); mss_now = tcp_bound_to_half_wnd(tp, mss_now); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; if (icsk->icsk_mtup.enabled) mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); tp->mss_cache = mss_now; return mss_now; } EXPORT_SYMBOL(tcp_sync_mss); /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. */ unsigned int tcp_current_mss(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; unsigned int header_len; struct tcp_out_options opts; struct tcp_key key; mss_now = tp->mss_cache; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } tcp_get_current_key(sk, &key); header_len = tcp_established_options(sk, NULL, &opts, &key) + sizeof(struct tcphdr); /* The mss_cache is sized based on tp->tcp_header_len, which assumes * some common options. If this is an odd packet (because we have SACK * blocks etc) then our calculated header_len will be different, and * we have to adjust mss_now correspondingly */ if (header_len != tp->tcp_header_len) { int delta = (int) header_len - tp->tcp_header_len; mss_now -= delta; } return mss_now; } /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, * and if application hit its sndbuf limit recently. */ static void tcp_cwnd_application_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { /* Limited by application or receiver window. */ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 win_used = max(tp->snd_cwnd_used, init_win); if (win_used < tcp_snd_cwnd(tp)) { tp->snd_ssthresh = tcp_current_ssthresh(sk); tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1); } tp->snd_cwnd_used = 0; } tp->snd_cwnd_stamp = tcp_jiffies32; } static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); /* Track the strongest available signal of the degree to which the cwnd * is fully utilized. If cwnd-limited then remember that fact for the * current window. If not cwnd-limited then track the maximum number of * outstanding packets in the current window. (If cwnd-limited then we * chose to not update tp->max_packets_out to avoid an extra else * clause with no functional impact.) */ if (!before(tp->snd_una, tp->cwnd_usage_seq) || is_cwnd_limited || (!tp->is_cwnd_limited && tp->packets_out > tp->max_packets_out)) { tp->is_cwnd_limited = is_cwnd_limited; tp->max_packets_out = tp->packets_out; tp->cwnd_usage_seq = tp->snd_nxt; } if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_jiffies32; } else { /* Network starves. */ if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); /* The following conditions together indicate the starvation * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); } } /* Minshall's variant of the Nagle send check. */ static bool tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml, tp->snd_una) && !after(tp->snd_sml, tp->snd_nxt); } /* Update snd_sml if this skb is under mss * Note that a TSO packet might end with a sub-mss segment * The test is really : * if ((skb->len % mss) != 0) * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; * But we can avoid doing the divide again given we already have * skb_pcount = skb->len / mss_now */ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, const struct sk_buff *skb) { if (skb->len < tcp_skb_pcount(skb) * mss_now) tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } /* Return false, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. (provided by caller in %partial bool) * 2. Or it contains FIN. (already checked by caller) * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, int nonagle) { return partial && ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } /* Return how many segs we'd like on a TSO packet, * depending on current pacing rate, and how close the peer is. * * Rationale is: * - For close peers, we rather send bigger packets to reduce * cpu costs, because occasional losses will be repaired fast. * - For long distance/rtt flows, we would like to get ACK clocking * with 1 ACK per ms. * * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting * in bigger TSO bursts. We we cut the RTT-based allowance in half * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance * is below 1500 bytes after 6 * ~500 usec = 3ms. */ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs) { unsigned long bytes; u32 r; bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log); if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) bytes += sk->sk_gso_max_size >> r; bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size); return max_t(u32, bytes / mss_now, min_tso_segs); } /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. */ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 min_tso, tso_segs; min_tso = ca_ops->min_tso_segs ? ca_ops->min_tso_segs(sk) : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, unsigned int mss_now, unsigned int max_segs, int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) return max_len; needed = min(skb->len, window); if (max_len <= needed) return max_len; partial = needed % mss_now; /* If last segment is not a full MSS, check if Nagle rules allow us * to include this last segment in this skb. * Otherwise, we'll split the skb at last MSS boundary */ if (tcp_nagle_check(partial != 0, tp, nonagle)) return needed - partial; return needed; } /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ static u32 tcp_cwnd_test(const struct tcp_sock *tp) { u32 in_flight, cwnd, halfcwnd; in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) return 0; /* For better scheduling, ensure we have at least * 2 GSO packets in flight. */ halfcwnd = max(cwnd >> 1, 1U); return min(halfcwnd, cwnd - in_flight); } /* Initialize TSO state of a skb. * This must be invoked the first time we consider transmitting * SKB onto the wire. */ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) return tcp_set_skb_tso_segs(skb, mss_now); return tso_segs; } /* Return true if the Nagle test allows this packet to be * sent now. */ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the * write_queue (they have no chances to get new data). * * This is implemented in the callers, where they modify the 'nonagle' * argument based upon the location of SKB in the send queue. */ if (nonagle & TCP_NAGLE_PUSH) return true; /* Don't use the nagle rule for urgent data (or for the final FIN). */ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) return true; return false; } /* Does at least the first segment of SKB fit into the send window? */ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (skb->len > cur_mss) end_seq = TCP_SKB_CB(skb)->seq + cur_mss; return !after(end_seq, tcp_wnd_end(tp)); } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions * in order to speed up the splitting operation. In particular, we * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { int nlen = skb->len - len; struct sk_buff *buff; u8 flags; /* All of a TSO frame must be composed of paged data. */ DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len); buff = tcp_stream_alloc_skb(sk, gfp, true); if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; tcp_skb_fragment_eor(skb, buff); skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE); return 0; } /* Try to defer sending, if possible, in order to minimize the amount * of TSO splitting we do. View it as a kind of TSO Nagle test. * * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, bool *is_cwnd_limited, bool *is_rwnd_limited, u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; s64 delta; if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer * only if the last write was recent (1 ms). * Note that tp->tcp_wstamp_ns can be in the future if we have * packets waiting in a qdisc or device for EDT delivery. */ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; if (delta > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1); BUG_ON(tcp_snd_cwnd(tp) <= in_flight); send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* From in_flight test above, we know that cwnd > in_flight. */ cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache; limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ if (limit >= max_segs * tp->mss_cache) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. */ chunk /= win_divisor; if (limit >= chunk) goto send_now; } else { /* Different approach, try not to defer past a single * ACK. Receiver should ACK every other full sized * frame, so if we have space for more than 3 frames * then send now. */ if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache) goto send_now; } /* TODO : use tsorted_sent_queue ? */ head = tcp_rtx_queue_head(sk); if (!head) goto send_now; delta = tp->tcp_clock_cache - head->tstamp; /* If next ACK is likely to come too late (half srtt), do not defer */ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) goto send_now; /* Ok, it looks like it is advisable to defer. * Three cases are tracked : * 1) We are cwnd-limited * 2) We are rwnd-limited * 3) We are application limited. */ if (cong_win < send_win) { if (cong_win <= skb->len) { *is_cwnd_limited = true; return true; } } else { if (send_win <= skb->len) { *is_rwnd_limited = true; return true; } } /* If this packet won't get more data, do not wait. */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || TCP_SKB_CB(skb)->eor) goto send_now; return true; send_now: return false; } static inline void tcp_mtu_check_reprobe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 interval; s32 delta; interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval); delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); /* Update current search range */ icsk->icsk_mtup.probe_size = 0; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); /* Update probe time stamp */ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } } static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) { struct sk_buff *skb, *next; skb = tcp_send_head(sk); tcp_for_write_queue_from_safe(skb, next, sk) { if (len <= skb->len) break; if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb) || !skb_pure_zcopy_same(skb, next)) return false; len -= skb->len; } return true; } static int tcp_clone_payload(struct sock *sk, struct sk_buff *to, int probe_size) { skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags; int i, todo, len = 0, nr_frags = 0; const struct sk_buff *skb; if (!sk_wmem_schedule(sk, to->truesize + probe_size)) return -ENOMEM; skb_queue_walk(&sk->sk_write_queue, skb) { const skb_frag_t *fragfrom = skb_shinfo(skb)->frags; if (skb_headlen(skb)) return -EINVAL; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) { if (len >= probe_size) goto commit; todo = min_t(int, skb_frag_size(fragfrom), probe_size - len); len += todo; if (lastfrag && skb_frag_page(fragfrom) == skb_frag_page(lastfrag) && skb_frag_off(fragfrom) == skb_frag_off(lastfrag) + skb_frag_size(lastfrag)) { skb_frag_size_add(lastfrag, todo); continue; } if (unlikely(nr_frags == MAX_SKB_FRAGS)) return -E2BIG; skb_frag_page_copy(fragto, fragfrom); skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); nr_frags++; lastfrag = fragto++; } } commit: WARN_ON_ONCE(len != probe_size); for (i = 0; i < nr_frags; i++) skb_frag_ref(to, i); skb_shinfo(to)->nr_frags = nr_frags; to->truesize += probe_size; to->len += probe_size; to->data_len += probe_size; __skb_header_release(to); return 0; } /* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if * all its payload was moved to another one (dst). * Make sure to transfer tcp_flags, eor, and tstamp. */ static void tcp_eat_one_skb(struct sock *sk, struct sk_buff *dst, struct sk_buff *src) { TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags; TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor; tcp_skb_collapse_tstamp(dst, src); tcp_unlink_write_queue(src, sk); tcp_wmem_free_skb(sk, src); } /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing * changes resulting in larger path MTUs. * * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, * -1 otherwise */ static int tcp_mtu_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *nskb, *next; struct net *net = sock_net(sk); int probe_size; int size_needed; int copy, len; int mss_now; int interval; /* Not currently probing/verifying, * not in recovery, * have enough cwnd, and * not SACKing (the variable headers throw things off) */ if (likely(!icsk->icsk_mtup.enabled || icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || tcp_snd_cwnd(tp) < 11 || tp->rx_opt.num_sacks || tp->rx_opt.dsack)) return -1; /* Use binary search for probe_size between tcp_mss_base, * and current mss_clamp. if (search_high - search_low) * smaller than a threshold, backoff from probing. */ mss_now = tcp_current_mss(sk); probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high + icsk->icsk_mtup.search_low) >> 1); size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; /* When misfortune happens, we are reprobing actively, * and then reprobe timer has expired. We stick with current * probing process by not resetting search range to its orignal. */ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) { /* Check whether enough time has elaplased for * another round of probing. */ tcp_mtu_check_reprobe(sk); return -1; } /* Have enough data in the send queue to probe? */ if (tp->write_seq - tp->snd_nxt < size_needed) return -1; if (tp->snd_wnd < size_needed) return -1; if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) return 0; /* Do we need to wait to drain cwnd? With none in flight, don't stall */ if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) { if (!tcp_packets_in_flight(tp)) return -1; else return 0; } if (!tcp_can_coalesce_send_queue_head(sk, probe_size)) return -1; /* We're allowed to probe. Build it now. */ nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false); if (!nskb) return -1; /* build the payload, and be prepared to abort if this fails. */ if (tcp_clone_payload(sk, nskb, probe_size)) { tcp_skb_tsorted_anchor_cleanup(nskb); consume_skb(nskb); return -1; } sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); skb_copy_decrypted(nskb, skb); mptcp_skb_ext_copy(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; tcp_insert_write_queue_before(nskb, skb, sk); tcp_highest_sack_replace(sk, skb, nskb); len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); if (skb->len <= copy) { tcp_eat_one_skb(sk, nskb, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(skb, mss_now); TCP_SKB_CB(skb)->seq += copy; } len += copy; if (len >= probe_size) break; } tcp_init_tso_segs(nskb, nskb->len); /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). */ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); tcp_event_new_data_sent(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; return 1; } return -1; } static bool tcp_pacing_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_needs_internal_pacing(sk)) return false; if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache) return false; if (!hrtimer_is_queued(&tp->pacing_timer)) { hrtimer_start(&tp->pacing_timer, ns_to_ktime(tp->tcp_wstamp_ns), HRTIMER_MODE_ABS_PINNED_SOFT); sock_hold(sk); } return true; } static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk) { const struct rb_node *node = sk->tcp_rtx_queue.rb_node; /* No skb in the rtx queue. */ if (!node) return true; /* Only one skb in rtx queue. */ return !node->rb_left && !node->rb_right; } /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. * (These limits are doubled for retransmits) * This allows for : * - better RTT estimation and ACK scheduling * - faster recovery * - high rates * Alas, some drivers / subsystems require a fair amount * of queued bytes to ensure line rate. * One example is wifi aggregation (802.11 AMPDU) */ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, unsigned int factor) { unsigned long limit; limit = max_t(unsigned long, 2 * skb->truesize, READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift)); if (sk->sk_pacing_status == SK_PACING_NONE) limit = min_t(unsigned long, limit, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); limit <<= factor; if (static_branch_unlikely(&tcp_tx_delay_enabled) && tcp_sk(sk)->tcp_tx_delay) { u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) * tcp_sk(sk)->tcp_tx_delay; /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we * approximate our needs assuming an ~100% skb->truesize overhead. * USEC_PER_SEC is approximated by 2^20. * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. */ extra_bytes >>= (20 - 1); limit += extra_bytes; } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty or has one skb. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ if (tcp_rtx_queue_empty_or_single_skb(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. */ smp_mb__after_atomic(); if (refcount_read(&sk->sk_wmem_alloc) > limit) return true; } return false; } static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { const u32 now = tcp_jiffies32; enum tcp_chrono old = tp->chrono_type; if (old > TCP_CHRONO_UNSPEC) tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now; tp->chrono_type = new; } void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* If there are multiple conditions worthy of tracking in a * chronograph then the highest priority enum takes precedence * over the other conditions. So that if something "more interesting" * starts happening, stop the previous chrono and start a new one. */ if (type > tp->chrono_type) tcp_chrono_set(tp, type); } void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* There are multiple conditions worthy of tracking in a * chronograph, so that the highest priority enum takes * precedence over the other conditions (see tcp_chrono_start). * If a condition stops, we only stop chrono tracking if * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. */ if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } /* First skb in the write queue is smaller than ideal packet size. * Check if we can move payload from the second skb in the queue. */ static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount) { struct sk_buff *next_skb = skb->next; unsigned int nlen; if (tcp_skb_is_last(sk, skb)) return; if (!tcp_skb_can_collapse(skb, next_skb)) return; nlen = min_t(u32, amount, next_skb->len); if (!nlen || !skb_shift(skb, next_skb, nlen)) return; TCP_SKB_CB(skb)->end_seq += nlen; TCP_SKB_CB(next_skb)->seq += nlen; if (!next_skb->len) { /* In case FIN is set, we need to update end_seq */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; tcp_eat_one_skb(sk, skb, next_skb); } } /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. * * LARGESEND note: !tcp_urg_mode is overkill, only frames between * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * * Send at most one packet when push_one > 0. Temporarily ignore * cwnd limit to force at most one packet out when push_one == 2. * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; u32 cwnd_quota, max_segs; int result; bool is_cwnd_limited = false, is_rwnd_limited = false; sent_pkts = 0; tcp_mstamp_refresh(tp); if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); if (!result) { return false; } else if (result > 0) { sent_pkts = 1; } } max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; int missing_bytes; if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ tp->tcp_wstamp_ns = tp->tcp_clock_cache; skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ } if (tcp_pacing_check(sk)) break; cwnd_quota = tcp_cwnd_test(tp); if (!cwnd_quota) { if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; else break; } cwnd_quota = min(cwnd_quota, max_segs); missing_bytes = cwnd_quota * mss_now - skb->len; if (missing_bytes > 0) tcp_grow_skb(sk, skb, missing_bytes); tso_segs = tcp_set_skb_tso_segs(skb, mss_now); if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; break; } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) break; } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, &is_rwnd_limited, max_segs)) break; } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, cwnd_quota, nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; if (tcp_small_queue_check(sk, skb, 0)) break; /* Argh, we hit an empty skb(), presumably a thread * is sleeping in sendmsg()/sk_stream_wait_memory(). * We do not want to send a pure-ack packet and have * a strange looking rtx queue with empty packet(s). */ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) break; if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; repair: /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts += tcp_skb_pcount(skb); if (push_one) break; } if (is_rwnd_limited) tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED); else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)); if (likely(sent_pkts || is_cwnd_limited)) tcp_cwnd_validate(sk, is_cwnd_limited); if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk, false); return false; } return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, timeout_us, rto_delta_us; int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ if (rcu_access_pointer(tp->fastopen_rsk)) return false; early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans); /* Schedule a loss probe in 2*RTT for SACK capable connections * not in loss recovery, that are either limited by cwnd or application. */ if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || (icsk->icsk_ca_state != TCP_CA_Open && icsk->icsk_ca_state != TCP_CA_CWR)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account * for delayed ack when there's one outstanding packet. If no RTT * sample is available then probe after TCP_TIMEOUT_INIT. */ if (tp->srtt_us) { timeout_us = tp->srtt_us >> 2; if (tp->packets_out == 1) timeout_us += tcp_rto_min_us(sk); else timeout_us += TCP_TIMEOUT_MIN_US; timeout = usecs_to_jiffies(timeout_us); } else { timeout = TCP_TIMEOUT_INIT; } /* If the RTO formula yields an earlier time, then use that time. */ rto_delta_us = advancing_rto ? jiffies_to_usecs(inet_csk(sk)->icsk_rto) : tcp_rto_delta_us(sk); /* How far in future is RTO? */ if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); return true; } /* Thanks to skb fast clones, we can detect if a prior transmit of * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! */ static bool skb_still_in_host_queue(struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); smp_mb__after_atomic(); if (skb_fclone_busy(sk, skb)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; } } return false; } /* When probe timeout (PTO) fires, try send a new segment if possible, else * retransmit the last segment. */ void tcp_send_loss_probe(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int pcount; int mss = tcp_current_mss(sk); /* At most one outstanding TLP */ if (tp->tlp_high_seq) goto rearm_timer; tp->tlp_retrans = 0; skb = tcp_send_head(sk); if (skb && tcp_snd_wnd_test(tp, skb, mss)) { pcount = tp->packets_out; tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); if (tp->packets_out > pcount) goto probe_sent; goto rearm_timer; } skb = skb_rb_last(&sk->tcp_rtx_queue); if (unlikely(!skb)) { WARN_ONCE(tp->packets_out, "invalid inflight: %u state %u cwnd %u mss %d\n", tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss); inet_csk(sk)->icsk_pending = 0; return; } if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; pcount = tcp_skb_pcount(skb); if (WARN_ON(!pcount)) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; tp->tlp_retrans = 1; probe_sent: /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ inet_csk(sk)->icsk_pending = 0; rearm_timer: tcp_rearm_rto(sk); } /* Push out any pending frames which were held back due to * TCP_CORK or attempt at coalescing tiny packets. * The socket must be locked by the caller. */ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle) { /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and * all will be happy. */ if (unlikely(sk->sk_state == TCP_CLOSE)) return; if (tcp_write_xmit(sk, cur_mss, nonagle, 0, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. */ void tcp_push_one(struct sock *sk, unsigned int mss_now) { struct sk_buff *skb = tcp_send_head(sk); BUG_ON(!skb || skb->len < mss_now); tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); } /* This function returns the amount that we can raise the * usable window based on the following constraints * * 1. The window can never be shrunk once it is offered (RFC 793) * 2. We limit memory per socket * * RFC 1122: * "the suggested [SWS] avoidance algorithm for the receiver is to keep * RECV.NEXT + RCV.WIN fixed until: * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" * * i.e. don't raise the right edge of the window until you can raise * it at least MSS bytes. * * Unfortunately, the recommended algorithm breaks header prediction, * since header prediction assumes th->window stays fixed. * * Strictly speaking, keeping th->window fixed violates the receiver * side SWS prevention criteria. The problem is that under this rule * a stream of single byte packets will cause the right side of the * window to always advance by a single byte. * * Of course, if the sender implements sender side SWS prevention * then this will not be a problem. * * BSD seems to make the following compromise: * * If the free space is less than the 1/4 of the maximum * space available and the free space is less than 1/2 mss, * then set the window to 0. * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ] * Otherwise, just prevent the window from shrinking * and from being larger than the largest representable value. * * This prevents incremental opening of the window in the regime * where TCP is limited by the speed of the reader side taking * data out of the TCP receive queue. It does nothing about * those cases where the window is constrained on the sender side * because the pipeline is full. * * BSD also seems to "accidentally" limit itself to windows that are a * multiple of MSS, at least until the free space gets quite small. * This would appear to be a side effect of the mbuf implementation. * Combining these two algorithms results in the observed behavior * of having a fixed window size at almost all times. * * Below we obtain similar behavior by forcing the offered window to * a multiple of the mss when it is feasible to do so. * * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. * Regular options like TIMESTAMP are taken into account. */ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int allowed_space = tcp_full_space(sk); int full_space, window; if (sk_is_mptcp(sk)) mptcp_space(sk, &free_space, &allowed_space); full_space = min_t(int, tp->window_clamp, allowed_space); if (unlikely(mss > full_space)) { mss = full_space; if (mss <= 0) return 0; } /* Only allow window shrink if the sysctl is enabled and we have * a non-zero scaling factor in effect. */ if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale) goto shrink_window_allowed; /* do not allow window to shrink */ if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); /* free_space might become our new window, make sure we don't * increase it due to wscale. */ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); /* if free space is less than mss estimate, or is below 1/16th * of the maximum allowed, try to move to zero-window, else * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and * new incoming data is dropped due to memory limits. * With large window, mss test triggers way too late in order * to announce zero window in time before rmem limit kicks in. */ if (free_space < (allowed_space >> 4) || free_space < mss) return 0; } if (free_space > tp->rcv_ssthresh) free_space = tp->rcv_ssthresh; /* Don't do rounding if we are using window scaling, since the * scaled window will not line up with the MSS boundary anyway. */ if (tp->rx_opt.rcv_wscale) { window = free_space; /* Advertise enough space so that it won't get scaled away. * Import case: prevent zero window announcement if * 1<<rcv_wscale > mss. */ window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); } else { window = tp->rcv_wnd; /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. * If our current window offering is within 1 mss of the * free space we just keep it. This prevents the divide * and multiply from happening most of the time. * We also don't do any window rounding when the free space * is too small. */ if (window <= free_space - mss || window > free_space) window = rounddown(free_space, mss); else if (mss == full_space && free_space > window + (full_space >> 1)) window = free_space; } return window; shrink_window_allowed: /* new window should always be an exact multiple of scaling factor */ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); /* if free space is too low, return a zero window */ if (free_space < (allowed_space >> 4) || free_space < mss || free_space < (1 << tp->rx_opt.rcv_wscale)) return 0; } if (free_space > tp->rcv_ssthresh) { free_space = tp->rcv_ssthresh; /* new window should always be an exact multiple of scaling factor * * For this case, we ALIGN "up" (increase free_space) because * we know free_space is not zero here, it has been reduced from * the memory-based limit, and rcv_ssthresh is not a hard limit * (unlike sk_rcvbuf). */ free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale)); } return free_space; } void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb) { if (unlikely(tcp_has_tx_tstamp(next_skb))) { const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); struct skb_shared_info *shinfo = skb_shinfo(skb); shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tskey = next_shinfo->tskey; TCP_SKB_CB(skb)->txstamp_ack |= TCP_SKB_CB(next_skb)->txstamp_ack; } } /* Collapses two adjacent SKB's during retransmission. */ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = skb_rb_next(skb); int next_skb_size; next_skb_size = next_skb->len; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size)) return false; tcp_highest_sack_replace(sk, next_skb, skb); /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; /* Merge over control information. This moves PSH/FIN etc. over */ TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); if (next_skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = skb; tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); tcp_skb_collapse_tstamp(skb, next_skb); tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } /* Check if coalescing SKBs is legal. */ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) return false; if (skb_cloned(skb)) return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; return true; } /* Collapse packets in the retransmit queue to make to create * less packets on the wire. This is only done on retransmission. */ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, int space) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = to, *tmp; bool first = true; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; if (!tcp_skb_can_collapse(to, skb)) break; space -= skb->len; if (first) { first = false; continue; } if (space < 0) break; if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) break; if (!tcp_collapse_retrans(sk, to)) break; } } /* This retransmits one SKB. Policy decisions and retransmit queue * state updates are done by the caller. Returns non-zero if an * error occurred which prevented the send. */ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss; int diff, len, err; int avail_wnd; /* Inconclusive MTU probe */ if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; if (skb_still_in_host_queue(sk, skb)) return -EBUSY; start: if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; TCP_SKB_CB(skb)->seq++; goto start; } if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { WARN_ON_ONCE(1); return -EINVAL; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return -ENOMEM; } if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ cur_mss = tcp_current_mss(sk); avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case * our retransmit of one segment serves as a zero window probe. */ if (avail_wnd <= 0) { if (TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; avail_wnd = cur_mss; } len = cur_mss * segs; if (len > avail_wnd) { len = rounddown(avail_wnd, cur_mss); if (!len) len = avail_wnd; } if (skb->len > len) { if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; diff = tcp_skb_pcount(skb); tcp_set_skb_tso_segs(skb, cur_mss); diff -= tcp_skb_pcount(skb); if (diff) tcp_adjust_pcount(sk, skb, diff); avail_wnd = min_t(int, avail_wnd, cur_mss); if (skb->len < avail_wnd) tcp_retrans_try_collapse(sk, skb, avail_wnd); } /* RFC3168, section 6.1.1.1. ECN fallback */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); /* Update global and local TCP statistics. */ segs = tcp_skb_pcount(skb); TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; tp->bytes_retrans += skb->len; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom * beyond what csum_start can cover. */ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); if (nskb) { nskb->dev = NULL; err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC); } else { err = -ENOBUFS; } } tcp_skb_tsorted_restore(skb); if (!err) { tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns); tcp_rate_skb_sent(sk, skb); } } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); if (likely(!err)) { trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } /* To avoid taking spuriously low RTT samples based on a timestamp * for a transmit that never happened, always mark EVER_RETRANS */ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; return err; } int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct tcp_sock *tp = tcp_sk(sk); int err = __tcp_retransmit_skb(sk, skb, segs); if (err == 0) { #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); } #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); } /* Save stamp of the first (attempted) retransmit. */ if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb); if (tp->undo_retrans < 0) tp->undo_retrans = 0; tp->undo_retrans += tcp_skb_pcount(skb); return err; } /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. */ void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); bool rearm_timer = false; u32 max_segs; int mib_idx; if (!tp->packets_out) return; rtx_head = tcp_rtx_queue_head(sk); skb = tp->retransmit_skb_hint ?: rtx_head; max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); skb_rbtree_walk_from(skb) { __u8 sacked; int segs; if (tcp_pacing_check(sk)) break; /* we could do better than to assign each time */ if (!hole) tp->retransmit_skb_hint = skb; segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp); if (segs <= 0) break; sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() have aggregated large skbs, * we need to make sure not sending too bigs TSO packets */ segs = min_t(int, segs, max_segs); if (tp->retrans_out >= tp->lost_out) { break; } else if (!(sacked & TCPCB_LOST)) { if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; } else { if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; } if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; if (tcp_small_queue_check(sk, skb, 1)) break; if (tcp_retransmit_skb(sk, skb, segs)) break; NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) rearm_timer = true; } if (rearm_timer) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } /* We allow to exceed memory limits for FIN packets to expedite * connection tear down and (memory) recovery. * Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. * In general, we want to allow one skb per socket to avoid hangs * with edge trigger epoll() */ void sk_forced_mem_schedule(struct sock *sk, int size) { int delta, amt; delta = size - sk->sk_forward_alloc; if (delta <= 0) return; amt = sk_mem_pages(delta); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } /* Send a FIN. The caller locks the socket for us. * We should try to send a FIN packet really hard, but eventually give up. */ void tcp_send_fin(struct sock *sk) { struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); /* Optimization, tack on the FIN if we have one skb in write queue and * this skb was not yet sent, or we are under memory pressure. * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ tskb = tail; if (!tskb && tcp_under_memory_pressure(sk)) tskb = skb_rb_last(&sk->tcp_rtx_queue); if (tskb) { TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; if (!tail) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have * if FIN had been sent. This is because retransmit path * does not change tp->snd_nxt. */ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); return; } } else { skb = alloc_skb_fclone(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!skb)) return; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there * was unread data in the receive queue. This behavior is recommended * by RFC 2525, section 2.17. -DaveM */ void tcp_send_active_reset(struct sock *sk, gfp_t priority, enum sk_rst_reason reason) { struct sk_buff *skb; TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); /* skb of trace_tcp_send_reset() keeps the skb that caused RST, * skb here is different to the troublesome skb, so use NULL */ trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED); } /* Send a crossed SYN-ACK during socket establishment. * WARNING: This routine must only be called when we have already sent * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. */ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = skb_copy(skb, GFP_ATOMIC); } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; tcp_ecn_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /** * tcp_make_synack - Allocate one skb and build a SYNACK packet. * @sk: listener socket * @dst: dst entry attached to the SYNACK. It is consumed and caller * should not use it again. * @req: request_sock pointer * @foc: cookie for tcp fast open * @synack_type: Type of synack to prepare * @syn_skb: SYN packet just received. It could be NULL for rtx case. */ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); struct tcp_out_options opts; struct tcp_key key = {}; struct sk_buff *skb; int tcp_header_size; struct tcphdr *th; int mss; u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; } /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); switch (synack_type) { case TCP_SYNACK_NORMAL: skb_set_owner_w(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: /* Under synflood, we do not attach skb to a socket, * to avoid false sharing. */ break; case TCP_SYNACK_FASTOPEN: /* sk is a const pointer, because we want to express multiple * cpu might call us concurrently. * sk->sk_wmem_alloc in an atomic, we can promote to rw. */ skb_set_owner_w(skb, (struct sock *)sk); break; } skb_dst_set(skb, dst); mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb_set_delivery_time(skb, cookie_init_timestamp(req, now), SKB_CLOCK_MONOTONIC); else #endif { skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif if (tcp_rsk_used_ao(req)) { #ifdef CONFIG_TCP_AO struct tcp_ao_key *ao_key = NULL; u8 keyid = tcp_rsk(req)->ao_keyid; u8 rnext = tcp_rsk(req)->ao_rcv_next; ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req), keyid, -1); /* If there is no matching key - avoid sending anything, * especially usigned segments. It could try harder and lookup * for another peer-matching key, but the peer has requested * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here. */ if (unlikely(!ao_key)) { trace_tcp_ao_synack_no_key(sk, keyid, rnext); rcu_read_unlock(); kfree_skb(skb); net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n", keyid); return NULL; } key.ao_key = ao_key; key.type = TCP_KEY_AO; #endif } else { #ifdef CONFIG_TCP_MD5SIG key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4); /* bpf program will be interested in the tcp_flags */ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &key, foc, synack_type, syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); th = (struct tcphdr *)skb->data; memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; skb->ip_summed = CHECKSUM_PARTIAL; th->seq = htonl(tcp_rsk(req)->snt_isn); /* XXX data is queued and acked as is. No buffer/window check */ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key); th->doff = (tcp_header_size >> 2); TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); /* Okay, we have all we need - do the md5 hash if needed */ if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, req_to_sk(req), skb); #endif } else if (tcp_key_is_ao(&key)) { #ifdef CONFIG_TCP_AO tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location, key.ao_key, req, skb, opts.hash_location - (u8 *)th, 0); #endif } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_unlock(); #endif bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); tcp_add_tx_delay(skb, tp); return skb; } EXPORT_SYMBOL(tcp_make_synack); static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); if (ca_key == TCP_CA_UNSPEC) return; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } rcu_read_unlock(); } /* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ tp->tcp_header_len = sizeof(struct tcphdr); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; tcp_ao_connect_init(sk); /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0; tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW)); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) WRITE_ONCE(tp->window_clamp, tcp_full_space(sk)); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), &rcv_wscale, rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; WRITE_ONCE(sk->sk_err, 0); sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; tcp_init_wl(tp, 0); tcp_write_queue_purge(sk); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; __skb_header_release(skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } /* Build and send a SYN with data and (cached) Fast Open cookie. However, * queue a data-only packet after the regular SYN, such that regular SYNs * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges * only the SYN sequence, the data are retransmitted in the first ACK. * If cookie is not cached or other error occurs, falls back to send a * regular SYN with Fast Open cookie request option. */ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; struct page_frag *pfrag = sk_page_frag(sk); struct sk_buff *syn_data; int space, err = 0; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) goto fallback; /* MSS for SYN-data is based on cached MSS and bounded by PMTU and * user-MSS. Reserve maximum option space for middleboxes that add * private TCP options. The cost is reduced data space in SYN :( */ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); /* Sync mss_cache after updating the mss_clamp */ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; space = min_t(size_t, space, fo->size); if (space && !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE), pfrag, sk->sk_allocation)) goto fallback; syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false); if (!syn_data) goto fallback; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (space) { space = min_t(size_t, space, pfrag->size - pfrag->offset); space = tcp_wmem_schedule(sk, space); } if (space) { space = copy_page_from_iter(pfrag->page, pfrag->offset, space, &fo->data->msg_iter); if (unlikely(!space)) { tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } skb_fill_page_desc(syn_data, 0, pfrag->page, pfrag->offset, space); page_ref_inc(pfrag->page); pfrag->offset += space; skb_len_add(syn_data, space); skb_zcopy_set(syn_data, fo->uarg, NULL); } /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) fo->data = NULL; fo->copied = space; tcp_connect_queue_skb(sk, syn_data); if (syn_data->len) tcp_chrono_start(sk, TCP_CHRONO_BUSY); err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) * we keep in write queue in case of a retransmit, as we * also have the SYN packet (with no data) in the same queue. */ TCP_SKB_CB(syn_data)->seq++; TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } /* data was not sent, put it in write_queue */ __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) fo->cookie.len = 0; err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); if (err) tp->syn_fastopen = 0; done: fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ return err; } /* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); #if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO) /* Has to be checked late, after setting daddr/saddr/ops. * Return error if the peer has both a md5 and a tcp-ao key * configured as this is ambiguous. */ if (unlikely(rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)))) { bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1); bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk); struct tcp_ao_info *ao_info; ao_info = rcu_dereference_check(tp->ao_info, lockdep_sock_is_held(sk)); if (ao_info) { /* This is an extra check: tcp_ao_required() in * tcp_v{4,6}_parse_md5_keys() should prevent adding * md5 keys on ao_required socket. */ needs_ao |= ao_info->ao_required; WARN_ON_ONCE(ao_info->ao_required && needs_md5); } if (needs_md5 && needs_ao) return -EKEYREJECTED; /* If we have a matching md5 key and no matching tcp-ao key * then free up ao_info if allocated. */ if (needs_md5) { tcp_ao_destroy_sock(sk, false); } else if (needs_ao) { tcp_clear_md5_list(sk); kfree(rcu_replace_pointer(tp->md5sig_info, NULL, lockdep_sock_is_held(sk))); } } #endif #ifdef CONFIG_TCP_AO if (unlikely(rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk)))) { /* Don't allow connecting if ao is configured but no * matching key is found. */ if (!tp->af_specific->ao_lookup(sk, sk, -1, -1)) return -EKEYREJECTED; } #endif if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ tcp_connect_init(sk); if (unlikely(tp->repair)) { tcp_finish_connect(sk, NULL); return 0; } buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); if (err == -ECONNREFUSED) return err; /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ WRITE_ONCE(tp->snd_nxt, tp->write_seq); tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); tp->pushed_seq = TCP_SKB_CB(buff)->seq; } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } EXPORT_SYMBOL(tcp_connect); u32 tcp_delack_max(const struct sock *sk) { u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1; return min(inet_csk(sk)->icsk_delack_max, delack_from_rto_min); } /* Send out a delayed ack, the caller does the policy checking * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() * for details. */ void tcp_send_delayed_ack(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int ato = icsk->icsk_ack.ato; unsigned long timeout; if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; if (inet_csk_in_pingpong_mode(sk) || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ /* If some rtt estimate is known, use it to bound delayed ack. * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ if (tp->srtt_us) { int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; } ato = min(ato, max_ato); } ato = min_t(u32, ato, tcp_delack_max(sk)); /* Stay within the limit we were given */ timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer is about to expire, send ACK now. */ if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } if (!time_before(timeout, icsk->icsk_ack.timeout)) timeout = icsk->icsk_ack.timeout; } icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. */ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) { struct sk_buff *buff; /* If we have been reset, we may not send again. */ if (sk->sk_state == TCP_CLOSE) return; /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. */ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!buff)) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned long delay; delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; if (delay < TCP_RTO_MAX) icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 */ skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. */ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); } EXPORT_SYMBOL_GPL(__tcp_send_ack); void tcp_send_ack(struct sock *sk) { __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); } /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. * * Question: what should we make while urgent mode? * 4.4BSD forces sending single byte of data. We cannot send * out of window data, because we have SND.NXT==SND.MAX... * * Current solution: to send TWO zero-length segments in urgent mode: * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. */ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (!skb) return -1; /* Reserve space for headers and set control bits. */ skb_reserve(skb, MAX_TCP_HEADER); /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } /* Called from setsockopt( ... TCP_REPAIR ) */ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; tcp_mstamp_refresh(tcp_sk(sk)); tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } /* Initiate keepalive or window probe from timer. */ int tcp_write_wakeup(struct sock *sk, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; if (sk->sk_state == TCP_CLOSE) return -1; skb = tcp_send_head(sk); if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; /* We are probing the opening of a window * but the window size is != 0 * must have been a result SWS avoidance ( sender ) */ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); return err; } else { if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) tcp_xmit_probe_skb(sk, 1, mib); return tcp_xmit_probe_skb(sk, 0, mib); } } /* A window probe timeout has occurred. If window is not closed send * a partial packet else a zero probe. */ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); unsigned long timeout; int err; err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; icsk->icsk_probes_tstamp = 0; return; } icsk->icsk_probes_out++; if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; timeout = tcp_probe0_when(sk, TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, * Let senders fight for local resources conservatively. */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash()); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, NULL); if (!res) { TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) { /* sk has const attribute because listeners are lockless. * However in this case, we are dealing with a passive fastopen * socket thus we can change total_retrans value. */ tcp_sk_rw(sk)->total_retrans++; } trace_tcp_retransmit_synack(sk, req); } return res; } EXPORT_SYMBOL(tcp_rtx_synack); |
9163 7496 2604 2607 2605 9165 3 6924 143 7054 31 7035 7034 7039 7036 4312 4308 4315 4299 12 8 64 4316 4310 4322 4000 4328 4326 7673 11 11 11 11 4 4 4 3816 106 2000 3875 3972 3946 6210 6213 7629 19 4478 3787 7630 7631 3582 87 3508 250 249 250 214 17 17 211 213 12 12 12 12 12 12 25 25 25 25 25 25 3815 3227 3491 3817 11 3797 250 3813 10 3818 19 3816 280 5 38 249 250 15 2 15 15 15 26 4 25 4 25 25 1672 3818 663 2 5 1 1 3 697 3 699 699 698 596 110 4712 4374 4710 4381 26 3 2791 2700 592 4524 2551 4342 2790 253 179 249 249 247 248 2580 1012 2583 1446 1450 845 1448 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swap.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. */ #include <linux/mm.h> #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; const int page_cluster_max = 31; /* Protecting only lru_rotate.fbatch which requires disabling interrupts */ struct lru_rotate { local_lock_t lock; struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { .lock = INIT_LOCAL_LOCK(lock), }; /* * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). */ struct cpu_fbatches { local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct folio_batch activate; #endif }; static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, unsigned long *flagsp) { if (folio_test_lru(folio)) { folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); } /* * In rare cases, when truncation or holepunching raced with * munlock after VM_LOCKED was cleared, Mlocked may still be * found set here. This does not indicate a problem, unless * "unevictable_pgs_cleared" appears worryingly large. */ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio); __folio_clear_mlocked(folio); zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); } } /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ static void page_cache_release(struct folio *folio) { struct lruvec *lruvec = NULL; unsigned long flags; __page_cache_release(folio, &lruvec, &flags); if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) { free_zone_device_folio(folio); return; } else if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } page_cache_release(folio); folio_undo_large_rmappable(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); /** * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru * * Release a list of pages which are strung together on page.lru. */ void put_pages_list(struct list_head *pages) { struct folio_batch fbatch; struct folio *folio, *next; folio_batch_init(&fbatch); list_for_each_entry_safe(folio, next, pages, lru) { if (!folio_put_testzero(folio)) continue; if (folio_test_hugetlb(folio)) { free_huge_folio(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ if (folio_batch_add(&fbatch, folio) > 0) continue; free_unref_folios(&fbatch); } if (fbatch.nr) free_unref_folios(&fbatch); INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. */ folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } lruvec_add_folio(lruvec, folio); trace_mm_lru_insertion(folio); } static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, struct folio *folio, move_fn_t move_fn) { if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) && !lru_cache_disabled()) return; folio_batch_move_lru(fbatch, move_fn); } static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, folio_nr_pages(folio)); } } /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it * to the tail of the inactive list. * * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ void folio_rotate_reclaimable(struct folio *folio) { if (!folio_test_locked(folio) && !folio_test_dirty(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; unsigned long flags; folio_get(folio); if (!folio_test_clear_lru(folio)) { folio_put(folio); return; } local_lock_irqsave(&lru_rotate.lock, flags); fbatch = this_cpu_ptr(&lru_rotate.fbatch); folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } } void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) { unsigned long cost; /* * Reflect the relative cost of incurring IO and spending CPU * time on rotations. This doesn't attempt to make a precise * comparison, it just says: if reloads are about comparable * between the LRU lists, or rotations are overwhelmingly * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; do { unsigned long lrusize; /* * Hold lruvec->lru_lock is safe here, since * 1) The pinned lruvec in reclaim, or * 2) From a pre-LRU page during refault (which also holds the * rcu lock, so would be safe even if the page was on the LRU * and could move simultaneously to a new lruvec). */ spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += cost; else lruvec->anon_cost += cost; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. */ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } void lru_note_cost_refault(struct folio *folio) { lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), folio_nr_pages(folio), 0); } static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_set_active(folio); lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, folio_activate_fn); } void folio_activate(struct folio *folio) { if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; folio_get(folio); if (!folio_test_clear_lru(folio)) { folio_put(folio); return; } local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.activate); folio_batch_add_and_move(fbatch, folio, folio_activate_fn); local_unlock(&cpu_fbatches.lock); } } #else static inline void folio_activate_drain(int cpu) { } void folio_activate(struct folio *folio) { struct lruvec *lruvec; if (folio_test_clear_lru(folio)) { lruvec = folio_lruvec_lock_irq(folio); folio_activate_fn(lruvec, folio); unlock_page_lruvec_irq(lruvec); folio_set_lru(folio); } } #endif static void __lru_cache_activate_folio(struct folio *folio) { struct folio_batch *fbatch; int i; local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being * activated has just been added to this batch. Note that only * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote * batch that is currently being drained. Furthermore, marking * a remote batch's folio active potentially hits a race where * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. */ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { struct folio *batch_folio = fbatch->folios[i]; if (batch_folio == folio) { folio_set_active(folio); break; } } local_unlock(&cpu_fbatches.lock); } #ifdef CONFIG_LRU_GEN static void folio_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; if (!folio_test_referenced(folio)) { folio_set_referenced(folio); return; } if (!folio_test_workingset(folio)) { folio_set_workingset(folio); return; } /* see the comment on MAX_NR_TIERS */ do { new_flags = old_flags & LRU_REFS_MASK; if (new_flags == LRU_REFS_MASK) break; new_flags += BIT(LRU_REFS_PGOFF); new_flags |= old_flags & ~LRU_REFS_MASK; } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } #else static void folio_inc_refs(struct folio *folio) { } #endif /* CONFIG_LRU_GEN */ /** * folio_mark_accessed - Mark a folio as having seen activity. * @folio: The folio to mark. * * This function will perform one of the following transitions: * * * inactive,unreferenced -> inactive,referenced * * inactive,referenced -> active,unreferenced * * active,unreferenced -> active,referenced * * When a newly allocated folio is not yet visible, so safe for non-atomic ops, * __folio_set_referenced() may be substituted for folio_mark_accessed(). */ void folio_mark_accessed(struct folio *folio) { if (lru_gen_enabled()) { folio_inc_refs(folio); return; } if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via * cpu_fbatches.activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) folio_activate(folio); else __lru_cache_activate_folio(folio); folio_clear_referenced(folio); workingset_activation(folio); } if (folio_test_idle(folio)) folio_clear_idle(folio); } EXPORT_SYMBOL(folio_mark_accessed); /** * folio_add_lru - Add a folio to an LRU list. * @folio: The folio to be added to the LRU. * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). */ void folio_add_lru(struct folio *folio) { struct folio_batch *fbatch; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_add_folio() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); folio_batch_add_and_move(fbatch, folio, lru_add_fn); local_unlock(&cpu_fbatches.lock); } EXPORT_SYMBOL(folio_add_lru); /** * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * * If the VMA is mlocked, @folio is added to the unevictable list. * Otherwise, it is treated the same way as folio_add_lru(). */ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) mlock_new_folio(folio); else folio_add_lru(folio); } /* * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the folio isn't mapped and dirty/writeback, the folio * could be reclaimed asap using the reclaim flag. * * 1. active, mapped folio -> none * 2. active, dirty/writeback folio -> inactive, head, reclaim * 3. inactive, mapped folio -> none * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, it moves to the head of the inactive list so the folio is * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) return; /* Some processes are using the folio */ if (folio_mapped(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* * Setting the reclaim flag could race with * folio_end_writeback() and confuse readahead. But the * race window is _really_ small and it's not a critical * problem. */ lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list. */ lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) { if (folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal * anonymous folios */ folio_clear_swapbacked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } } /* * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add_fn); fbatch = &per_cpu(lru_rotate.fbatch, cpu); /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&lru_rotate.lock, flags); folio_batch_move_lru(fbatch, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file_fn); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_fn); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree_fn); folio_activate_drain(cpu); } /** * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { struct folio_batch *fbatch; /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; folio_get(folio); if (!folio_test_clear_lru(folio)) { folio_put(folio); return; } local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file); folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); local_unlock(&cpu_fbatches.lock); } /* * folio_deactivate - deactivate a folio * @folio: folio to deactivate * * folio_deactivate() moves @folio to the inactive list if @folio was on the * active list and was not unevictable. This is done to accelerate the * reclaim of @folio. */ void folio_deactivate(struct folio *folio) { if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { struct folio_batch *fbatch; folio_get(folio); if (!folio_test_clear_lru(folio)) { folio_put(folio); return; } local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate); folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); local_unlock(&cpu_fbatches.lock); } } /** * folio_mark_lazyfree - make an anon folio lazyfree * @folio: folio to deactivate * * folio_mark_lazyfree() moves @folio to the inactive file list. * This is done to accelerate the reclaim of @folio. */ void folio_mark_lazyfree(struct folio *folio) { if (folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; folio_get(folio); if (!folio_test_clear_lru(folio)) { folio_put(folio); return; } local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree); folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); local_unlock(&cpu_fbatches.lock); } } void lru_add_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee folio_batch counter stores visible by this CPU * are visible to other CPUs before loading the current drain * generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the * new global drain generation number is stored before loading * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of pages to be migrated using isolate_lru_page(). * It drains pages on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); /* * Readers of lru_disable_count are protected by either disabling * preemption or rcu_read_lock: * * preempt_disable, local_irq_disable [bh_lru_lock()] * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] * preempt_disable [local_lock !CONFIG_PREEMPT_RT] * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * folios_put_refs - Reduce the reference count on a batch of folios. * @folios: The folios. * @refs: The number of refs to subtract from each folio. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need * to reinitialise it. If @refs is NULL, we subtract one from each * folio refcount. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? refs[i] : 1; if (is_huge_zero_folio(folio)) continue; if (folio_is_zone_device(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (put_devmap_managed_folio_refs(folio, nr_refs)) continue; if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_folio(folio); continue; } if (!folio_ref_sub_and_test(folio, nr_refs)) continue; /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); continue; } folio_undo_large_rmappable(folio); __page_cache_release(folio, &lruvec, &flags); if (j != i) folios->folios[j] = folio; j++; } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); if (!j) { folio_batch_reinit(folios); return; } folios->nr = j; mem_cgroup_uncharge_folios(folios); free_unref_folios(folios); } EXPORT_SYMBOL(folios_put_refs); /** * release_pages - batched put_page() * @arg: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. * * Note that the argument can be an array of pages, encoded pages, * or folio pointers. We ignore any encoded bits, and turn any of * them into just a folio that gets free'd. */ void release_pages(release_pages_arg arg, int nr) { struct folio_batch fbatch; int refs[PAGEVEC_SIZE]; struct encoded_page **encoded = arg.encoded_pages; int i; folio_batch_init(&fbatch); for (i = 0; i < nr; i++) { /* Turn any of the argument types into a folio */ struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ refs[fbatch.nr] = 1; if (unlikely(encoded_page_flags(encoded[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); if (folio_batch_add(&fbatch, folio) > 0) continue; folios_put_refs(&fbatch, refs); } if (fbatch.nr) folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); /* * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __folio_batch_release(struct folio_batch *fbatch) { if (!fbatch->percpu_pvec_drained) { lru_add_drain(); fbatch->percpu_pvec_drained = true; } folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune * * find_get_entries() fills a batch with both folios and shadow/swap/DAX * entries. This function prunes all the non-folio entries from @fbatch * without leaving holes, so that it can be passed on to folio-only batch * operations. */ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { unsigned int i, j; for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; if (!xa_is_value(folio)) fbatch->folios[j++] = folio; } fbatch->nr = j; } /* * Perform any setup for the swap system */ void __init swap_setup(void) { unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; else page_cluster = 3; /* * Right now other parts of the system means that we * _really_ don't want to cluster much more */ } |
14 14 14 6 1 1 1 1 1 1 2 14 10 4 2 10 1 1 1 1 4 5 12 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 | // SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for some logitech "special" devices * * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2006-2007 Jiri Kosina * Copyright (c) 2008 Jiri Slaby * Copyright (c) 2010 Hendrik Iben */ /* */ #include <linux/device.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/usb.h> #include <linux/wait.h> #include "usbhid/usbhid.h" #include "hid-ids.h" #include "hid-lg.h" #include "hid-lg4ff.h" #define LG_RDESC 0x001 #define LG_BAD_RELATIVE_KEYS 0x002 #define LG_DUPLICATE_USAGES 0x004 #define LG_EXPANDED_KEYMAP 0x010 #define LG_IGNORE_DOUBLED_WHEEL 0x020 #define LG_WIRELESS 0x040 #define LG_INVERT_HWHEEL 0x080 #define LG_NOGET 0x100 #define LG_FF 0x200 #define LG_FF2 0x400 #define LG_RDESC_REL_ABS 0x800 #define LG_FF3 0x1000 #define LG_FF4 0x2000 /* Size of the original descriptors of the Driving Force (and Pro) wheels */ #define DF_RDESC_ORIG_SIZE 130 #define DFP_RDESC_ORIG_SIZE 97 #define FV_RDESC_ORIG_SIZE 130 #define MOMO_RDESC_ORIG_SIZE 87 #define MOMO2_RDESC_ORIG_SIZE 87 #define FFG_RDESC_ORIG_SIZE 85 #define FG_RDESC_ORIG_SIZE 82 /* Fixed report descriptors for Logitech Driving Force (and Pro) * wheel controllers * * The original descriptors hide the separate throttle and brake axes in * a custom vendor usage page, providing only a combined value as * GenericDesktop.Y. * These descriptors remove the combined Y axis and instead report * separate throttle (Y) and brake (RZ). */ static __u8 df_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystick), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0A, /* Report Size (10), */ 0x14, /* Logical Minimum (0), */ 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */ 0x34, /* Physical Minimum (0), */ 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x0C, /* Report Count (12), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage (Buttons), */ 0x19, 0x01, /* Usage Minimum (1), */ 0x29, 0x0c, /* Usage Maximum (12), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x02, /* Report Count (2), */ 0x06, 0x00, 0xFF, /* Usage Page (Vendor: 65280), */ 0x09, 0x01, /* Usage (?: 1), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x81, 0x02, /* Input (Variable), */ 0x25, 0x07, /* Logical Maximum (7), */ 0x46, 0x3B, 0x01, /* Physical Maximum (315), */ 0x75, 0x04, /* Report Size (4), */ 0x65, 0x14, /* Unit (Degrees), */ 0x09, 0x39, /* Usage (Hat Switch), */ 0x81, 0x42, /* Input (Variable, Null State), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x65, 0x00, /* Unit (none), */ 0x06, 0x00, 0xFF, /* Usage Page (Vendor: 65280), */ 0x09, 0x01, /* Usage (?: 1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x35, /* Usage (Rz), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x95, 0x07, /* Report Count (7), */ 0x75, 0x08, /* Report Size (8), */ 0x09, 0x03, /* Usage (?: 3), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static __u8 dfp_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystick), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0E, /* Report Size (14), */ 0x14, /* Logical Minimum (0), */ 0x26, 0xFF, 0x3F, /* Logical Maximum (16383), */ 0x34, /* Physical Minimum (0), */ 0x46, 0xFF, 0x3F, /* Physical Maximum (16383), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x0E, /* Report Count (14), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x0E, /* Usage Maximum (0Eh), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x04, /* Report Size (4), */ 0x25, 0x07, /* Logical Maximum (7), */ 0x46, 0x3B, 0x01, /* Physical Maximum (315), */ 0x65, 0x14, /* Unit (Degrees), */ 0x09, 0x39, /* Usage (Hat Switch), */ 0x81, 0x42, /* Input (Variable, Nullstate), */ 0x65, 0x00, /* Unit, */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x75, 0x08, /* Report Size (8), */ 0x81, 0x01, /* Input (Constant), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x35, /* Usage (Rz), */ 0x81, 0x02, /* Input (Variable), */ 0x81, 0x01, /* Input (Constant), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x09, 0x02, /* Usage (02h), */ 0x95, 0x07, /* Report Count (7), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static __u8 fv_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystick), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0A, /* Report Size (10), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */ 0x35, 0x00, /* Physical Minimum (0), */ 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x0C, /* Report Count (12), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x0C, /* Usage Maximum (0Ch), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x02, /* Report Count (2), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x02, /* Usage (02h), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x25, 0x07, /* Logical Maximum (7), */ 0x46, 0x3B, 0x01, /* Physical Maximum (315), */ 0x75, 0x04, /* Report Size (4), */ 0x65, 0x14, /* Unit (Degrees), */ 0x09, 0x39, /* Usage (Hat Switch), */ 0x81, 0x42, /* Input (Variable, Null State), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x65, 0x00, /* Unit, */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x32, /* Usage (Z), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x95, 0x07, /* Report Count (7), */ 0x75, 0x08, /* Report Size (8), */ 0x09, 0x03, /* Usage (03h), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static __u8 momo_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystick), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0A, /* Report Size (10), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */ 0x35, 0x00, /* Physical Minimum (0), */ 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x08, /* Report Count (8), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x08, /* Usage Maximum (08h), */ 0x81, 0x02, /* Input (Variable), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x75, 0x0E, /* Report Size (14), */ 0x95, 0x01, /* Report Count (1), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x09, 0x00, /* Usage (00h), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x75, 0x08, /* Report Size (8), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x32, /* Usage (Z), */ 0x81, 0x02, /* Input (Variable), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x09, 0x02, /* Usage (02h), */ 0x95, 0x07, /* Report Count (7), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static __u8 momo2_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystick), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0A, /* Report Size (10), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */ 0x35, 0x00, /* Physical Minimum (0), */ 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x0A, /* Report Count (10), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x0A, /* Usage Maximum (0Ah), */ 0x81, 0x02, /* Input (Variable), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x00, /* Usage (00h), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x09, 0x01, /* Usage (01h), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x32, /* Usage (Z), */ 0x81, 0x02, /* Input (Variable), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x00, /* Usage (00h), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x09, 0x02, /* Usage (02h), */ 0x95, 0x07, /* Report Count (7), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static __u8 ffg_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystik), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0A, /* Report Size (10), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */ 0x35, 0x00, /* Physical Minimum (0), */ 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x06, /* Report Count (6), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x06, /* Usage Maximum (06h), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x81, 0x01, /* Input (Constant), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x32, /* Usage (Z), */ 0x81, 0x02, /* Input (Variable), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x09, 0x02, /* Usage (02h), */ 0x95, 0x07, /* Report Count (7), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static __u8 fg_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystik), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x35, 0x00, /* Physical Minimum (0), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x75, 0x08, /* Report Size (8), */ 0x95, 0x01, /* Report Count (1), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0xA4, /* Push, */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x02, /* Report Count (2), */ 0x81, 0x01, /* Input (Constant), */ 0x95, 0x06, /* Report Count (6), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x06, /* Usage Maximum (06h), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0xB4, /* Pop, */ 0x81, 0x02, /* Input (Constant), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x32, /* Usage (Z), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x75, 0x08, /* Report Size (8), */ 0x95, 0x04, /* Report Count (4), */ 0x09, 0x02, /* Usage (02h), */ 0xB1, 0x02, /* Feature (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection, */ }; /* * Certain Logitech keyboards send in report #3 keys which are far * above the logical maximum described in descriptor. This extends * the original value of 0x28c of logical maximum to 0x104d */ static __u8 *lg_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if ((drv_data->quirks & LG_RDESC) && *rsize >= 91 && rdesc[83] == 0x26 && rdesc[84] == 0x8c && rdesc[85] == 0x02) { hid_info(hdev, "fixing up Logitech keyboard report descriptor\n"); rdesc[84] = rdesc[89] = 0x4d; rdesc[85] = rdesc[90] = 0x10; } if ((drv_data->quirks & LG_RDESC_REL_ABS) && *rsize >= 51 && rdesc[32] == 0x81 && rdesc[33] == 0x06 && rdesc[49] == 0x81 && rdesc[50] == 0x06) { hid_info(hdev, "fixing up rel/abs in Logitech report descriptor\n"); rdesc[33] = rdesc[50] = 0x02; } switch (hdev->product) { case USB_DEVICE_ID_LOGITECH_WINGMAN_FG: if (*rsize == FG_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Wingman Formula GP report descriptor\n"); rdesc = fg_rdesc_fixed; *rsize = sizeof(fg_rdesc_fixed); } else { hid_info(hdev, "rdesc size test failed for formula gp\n"); } break; case USB_DEVICE_ID_LOGITECH_WINGMAN_FFG: if (*rsize == FFG_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Wingman Formula Force GP report descriptor\n"); rdesc = ffg_rdesc_fixed; *rsize = sizeof(ffg_rdesc_fixed); } break; /* Several wheels report as this id when operating in emulation mode. */ case USB_DEVICE_ID_LOGITECH_WHEEL: if (*rsize == DF_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Driving Force report descriptor\n"); rdesc = df_rdesc_fixed; *rsize = sizeof(df_rdesc_fixed); } break; case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL: if (*rsize == MOMO_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Momo Force (Red) report descriptor\n"); rdesc = momo_rdesc_fixed; *rsize = sizeof(momo_rdesc_fixed); } break; case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL2: if (*rsize == MOMO2_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Momo Racing Force (Black) report descriptor\n"); rdesc = momo2_rdesc_fixed; *rsize = sizeof(momo2_rdesc_fixed); } break; case USB_DEVICE_ID_LOGITECH_VIBRATION_WHEEL: if (*rsize == FV_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Formula Vibration report descriptor\n"); rdesc = fv_rdesc_fixed; *rsize = sizeof(fv_rdesc_fixed); } break; case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: if (*rsize == DFP_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Driving Force Pro report descriptor\n"); rdesc = dfp_rdesc_fixed; *rsize = sizeof(dfp_rdesc_fixed); } break; case USB_DEVICE_ID_LOGITECH_WII_WHEEL: if (*rsize >= 101 && rdesc[41] == 0x95 && rdesc[42] == 0x0B && rdesc[47] == 0x05 && rdesc[48] == 0x09) { hid_info(hdev, "fixing up Logitech Speed Force Wireless report descriptor\n"); rdesc[41] = 0x05; rdesc[42] = 0x09; rdesc[47] = 0x95; rdesc[48] = 0x0B; } break; } return rdesc; } #define lg_map_key_clear(c) hid_map_usage_clear(hi, usage, bit, max, \ EV_KEY, (c)) static int lg_ultrax_remote_mapping(struct hid_input *hi, struct hid_usage *usage, unsigned long **bit, int *max) { if ((usage->hid & HID_USAGE_PAGE) != HID_UP_LOGIVENDOR) return 0; set_bit(EV_REP, hi->input->evbit); switch (usage->hid & HID_USAGE) { /* Reported on Logitech Ultra X Media Remote */ case 0x004: lg_map_key_clear(KEY_AGAIN); break; case 0x00d: lg_map_key_clear(KEY_HOME); break; case 0x024: lg_map_key_clear(KEY_SHUFFLE); break; case 0x025: lg_map_key_clear(KEY_TV); break; case 0x026: lg_map_key_clear(KEY_MENU); break; case 0x031: lg_map_key_clear(KEY_AUDIO); break; case 0x032: lg_map_key_clear(KEY_TEXT); break; case 0x033: lg_map_key_clear(KEY_LAST); break; case 0x047: lg_map_key_clear(KEY_MP3); break; case 0x048: lg_map_key_clear(KEY_DVD); break; case 0x049: lg_map_key_clear(KEY_MEDIA); break; case 0x04a: lg_map_key_clear(KEY_VIDEO); break; case 0x04b: lg_map_key_clear(KEY_ANGLE); break; case 0x04c: lg_map_key_clear(KEY_LANGUAGE); break; case 0x04d: lg_map_key_clear(KEY_SUBTITLE); break; case 0x051: lg_map_key_clear(KEY_RED); break; case 0x052: lg_map_key_clear(KEY_CLOSE); break; default: return 0; } return 1; } static int lg_wireless_mapping(struct hid_input *hi, struct hid_usage *usage, unsigned long **bit, int *max) { if ((usage->hid & HID_USAGE_PAGE) != HID_UP_CONSUMER) return 0; switch (usage->hid & HID_USAGE) { case 0x1001: lg_map_key_clear(KEY_MESSENGER); break; case 0x1003: lg_map_key_clear(KEY_SOUND); break; case 0x1004: lg_map_key_clear(KEY_VIDEO); break; case 0x1005: lg_map_key_clear(KEY_AUDIO); break; case 0x100a: lg_map_key_clear(KEY_DOCUMENTS); break; /* The following two entries are Playlist 1 and 2 on the MX3200 */ case 0x100f: lg_map_key_clear(KEY_FN_1); break; case 0x1010: lg_map_key_clear(KEY_FN_2); break; case 0x1011: lg_map_key_clear(KEY_PREVIOUSSONG); break; case 0x1012: lg_map_key_clear(KEY_NEXTSONG); break; case 0x1013: lg_map_key_clear(KEY_CAMERA); break; case 0x1014: lg_map_key_clear(KEY_MESSENGER); break; case 0x1015: lg_map_key_clear(KEY_RECORD); break; case 0x1016: lg_map_key_clear(KEY_PLAYER); break; case 0x1017: lg_map_key_clear(KEY_EJECTCD); break; case 0x1018: lg_map_key_clear(KEY_MEDIA); break; case 0x1019: lg_map_key_clear(KEY_PROG1); break; case 0x101a: lg_map_key_clear(KEY_PROG2); break; case 0x101b: lg_map_key_clear(KEY_PROG3); break; case 0x101c: lg_map_key_clear(KEY_CYCLEWINDOWS); break; case 0x101f: lg_map_key_clear(KEY_ZOOMIN); break; case 0x1020: lg_map_key_clear(KEY_ZOOMOUT); break; case 0x1021: lg_map_key_clear(KEY_ZOOMRESET); break; case 0x1023: lg_map_key_clear(KEY_CLOSE); break; case 0x1027: lg_map_key_clear(KEY_MENU); break; /* this one is marked as 'Rotate' */ case 0x1028: lg_map_key_clear(KEY_ANGLE); break; case 0x1029: lg_map_key_clear(KEY_SHUFFLE); break; case 0x102a: lg_map_key_clear(KEY_BACK); break; case 0x102b: lg_map_key_clear(KEY_CYCLEWINDOWS); break; case 0x102d: lg_map_key_clear(KEY_WWW); break; /* The following two are 'Start/answer call' and 'End/reject call' on the MX3200 */ case 0x1031: lg_map_key_clear(KEY_OK); break; case 0x1032: lg_map_key_clear(KEY_CANCEL); break; case 0x1041: lg_map_key_clear(KEY_BATTERY); break; case 0x1042: lg_map_key_clear(KEY_WORDPROCESSOR); break; case 0x1043: lg_map_key_clear(KEY_SPREADSHEET); break; case 0x1044: lg_map_key_clear(KEY_PRESENTATION); break; case 0x1045: lg_map_key_clear(KEY_UNDO); break; case 0x1046: lg_map_key_clear(KEY_REDO); break; case 0x1047: lg_map_key_clear(KEY_PRINT); break; case 0x1048: lg_map_key_clear(KEY_SAVE); break; case 0x1049: lg_map_key_clear(KEY_PROG1); break; case 0x104a: lg_map_key_clear(KEY_PROG2); break; case 0x104b: lg_map_key_clear(KEY_PROG3); break; case 0x104c: lg_map_key_clear(KEY_PROG4); break; default: return 0; } return 1; } static int lg_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { /* extended mapping for certain Logitech hardware (Logitech cordless desktop LX500) */ static const u8 e_keymap[] = { 0,216, 0,213,175,156, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 0, 0,212, 174,167,152,161,112, 0, 0, 0,154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,183,184,185,186,187, 188,189,190,191,192,193,194, 0, 0, 0 }; struct lg_drv_data *drv_data = hid_get_drvdata(hdev); unsigned int hid = usage->hid; if (hdev->product == USB_DEVICE_ID_LOGITECH_RECEIVER && lg_ultrax_remote_mapping(hi, usage, bit, max)) return 1; if ((drv_data->quirks & LG_WIRELESS) && lg_wireless_mapping(hi, usage, bit, max)) return 1; if ((hid & HID_USAGE_PAGE) != HID_UP_BUTTON) return 0; hid &= HID_USAGE; /* Special handling for Logitech Cordless Desktop */ if (field->application == HID_GD_MOUSE) { if ((drv_data->quirks & LG_IGNORE_DOUBLED_WHEEL) && (hid == 7 || hid == 8)) return -1; } else { if ((drv_data->quirks & LG_EXPANDED_KEYMAP) && hid < ARRAY_SIZE(e_keymap) && e_keymap[hid] != 0) { hid_map_usage(hi, usage, bit, max, EV_KEY, e_keymap[hid]); return 1; } } return 0; } static int lg_input_mapped(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if ((drv_data->quirks & LG_BAD_RELATIVE_KEYS) && usage->type == EV_KEY && (field->flags & HID_MAIN_ITEM_RELATIVE)) field->flags &= ~HID_MAIN_ITEM_RELATIVE; if ((drv_data->quirks & LG_DUPLICATE_USAGES) && (usage->type == EV_KEY || usage->type == EV_REL || usage->type == EV_ABS)) clear_bit(usage->code, *bit); /* Ensure that Logitech wheels are not given a default fuzz/flat value */ if (usage->type == EV_ABS && (usage->code == ABS_X || usage->code == ABS_Y || usage->code == ABS_Z || usage->code == ABS_RZ)) { switch (hdev->product) { case USB_DEVICE_ID_LOGITECH_G29_WHEEL: case USB_DEVICE_ID_LOGITECH_WINGMAN_FG: case USB_DEVICE_ID_LOGITECH_WINGMAN_FFG: case USB_DEVICE_ID_LOGITECH_WHEEL: case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL: case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: case USB_DEVICE_ID_LOGITECH_G25_WHEEL: case USB_DEVICE_ID_LOGITECH_DFGT_WHEEL: case USB_DEVICE_ID_LOGITECH_G27_WHEEL: case USB_DEVICE_ID_LOGITECH_WII_WHEEL: case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL2: case USB_DEVICE_ID_LOGITECH_VIBRATION_WHEEL: field->application = HID_GD_MULTIAXIS; break; default: break; } } return 0; } static int lg_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if ((drv_data->quirks & LG_INVERT_HWHEEL) && usage->code == REL_HWHEEL) { input_event(field->hidinput->input, usage->type, usage->code, -value); return 1; } if (drv_data->quirks & LG_FF4) { return lg4ff_adjust_input_event(hdev, field, usage, value, drv_data); } return 0; } static int lg_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *rd, int size) { struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if (drv_data->quirks & LG_FF4) return lg4ff_raw_event(hdev, report, rd, size, drv_data); return 0; } static int lg_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct usb_interface *iface; __u8 iface_num; unsigned int connect_mask = HID_CONNECT_DEFAULT; struct lg_drv_data *drv_data; int ret; if (!hid_is_usb(hdev)) return -EINVAL; iface = to_usb_interface(hdev->dev.parent); iface_num = iface->cur_altsetting->desc.bInterfaceNumber; /* G29 only work with the 1st interface */ if ((hdev->product == USB_DEVICE_ID_LOGITECH_G29_WHEEL) && (iface_num != 0)) { dbg_hid("%s: ignoring ifnum %d\n", __func__, iface_num); return -ENODEV; } drv_data = kzalloc(sizeof(struct lg_drv_data), GFP_KERNEL); if (!drv_data) { hid_err(hdev, "Insufficient memory, cannot allocate driver data\n"); return -ENOMEM; } drv_data->quirks = id->driver_data; hid_set_drvdata(hdev, (void *)drv_data); if (drv_data->quirks & LG_NOGET) hdev->quirks |= HID_QUIRK_NOGET; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err_free; } if (drv_data->quirks & (LG_FF | LG_FF2 | LG_FF3 | LG_FF4)) connect_mask &= ~HID_CONNECT_FF; ret = hid_hw_start(hdev, connect_mask); if (ret) { hid_err(hdev, "hw start failed\n"); goto err_free; } /* Setup wireless link with Logitech Wii wheel */ if (hdev->product == USB_DEVICE_ID_LOGITECH_WII_WHEEL) { static const unsigned char cbuf[] = { 0x00, 0xAF, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; u8 *buf = kmemdup(cbuf, sizeof(cbuf), GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto err_stop; } ret = hid_hw_raw_request(hdev, buf[0], buf, sizeof(cbuf), HID_FEATURE_REPORT, HID_REQ_SET_REPORT); if (ret >= 0) { /* insert a little delay of 10 jiffies ~ 40ms */ wait_queue_head_t wait; init_waitqueue_head (&wait); wait_event_interruptible_timeout(wait, 0, msecs_to_jiffies(40)); /* Select random Address */ buf[1] = 0xB2; get_random_bytes(&buf[2], 2); ret = hid_hw_raw_request(hdev, buf[0], buf, sizeof(cbuf), HID_FEATURE_REPORT, HID_REQ_SET_REPORT); } kfree(buf); } if (drv_data->quirks & LG_FF) ret = lgff_init(hdev); else if (drv_data->quirks & LG_FF2) ret = lg2ff_init(hdev); else if (drv_data->quirks & LG_FF3) ret = lg3ff_init(hdev); else if (drv_data->quirks & LG_FF4) ret = lg4ff_init(hdev); if (ret) goto err_stop; return 0; err_stop: hid_hw_stop(hdev); err_free: kfree(drv_data); return ret; } static void lg_remove(struct hid_device *hdev) { struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if (drv_data->quirks & LG_FF4) lg4ff_deinit(hdev); hid_hw_stop(hdev); kfree(drv_data); } static const struct hid_device_id lg_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_S510_RECEIVER), .driver_data = LG_RDESC | LG_WIRELESS }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RECEIVER), .driver_data = LG_BAD_RELATIVE_KEYS }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_DINOVO_DESKTOP), .driver_data = LG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_ELITE_KBD), .driver_data = LG_IGNORE_DOUBLED_WHEEL | LG_EXPANDED_KEYMAP }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_CORDLESS_DESKTOP_LX500), .driver_data = LG_IGNORE_DOUBLED_WHEEL | LG_EXPANDED_KEYMAP }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_EXTREME_3D), .driver_data = LG_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DUAL_ACTION), .driver_data = LG_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WHEEL), .driver_data = LG_NOGET | LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD_CORD), .driver_data = LG_FF2 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD), .driver_data = LG_FF }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD2_2), .driver_data = LG_FF }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G29_WHEEL), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WINGMAN_F3D), .driver_data = LG_FF }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_FORCE3D_PRO), .driver_data = LG_FF }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_MOMO_WHEEL), .driver_data = LG_NOGET | LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_MOMO_WHEEL2), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_VIBRATION_WHEEL), .driver_data = LG_FF2 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G25_WHEEL), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DFGT_WHEEL), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G27_WHEEL), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DFP_WHEEL), .driver_data = LG_NOGET | LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WII_WHEEL), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WINGMAN_FG), .driver_data = LG_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WINGMAN_FFG), .driver_data = LG_NOGET | LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD2), .driver_data = LG_NOGET | LG_FF2 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_FLIGHT_SYSTEM_G940), .driver_data = LG_FF3 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_SPACENAVIGATOR), .driver_data = LG_RDESC_REL_ABS }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_SPACETRAVELLER), .driver_data = LG_RDESC_REL_ABS }, { } }; MODULE_DEVICE_TABLE(hid, lg_devices); static struct hid_driver lg_driver = { .name = "logitech", .id_table = lg_devices, .report_fixup = lg_report_fixup, .input_mapping = lg_input_mapping, .input_mapped = lg_input_mapped, .event = lg_event, .raw_event = lg_raw_event, .probe = lg_probe, .remove = lg_remove, }; module_hid_driver(lg_driver); #ifdef CONFIG_LOGIWHEELS_FF int lg4ff_no_autoswitch = 0; module_param_named(lg4ff_no_autoswitch, lg4ff_no_autoswitch, int, S_IRUGO); MODULE_PARM_DESC(lg4ff_no_autoswitch, "Do not switch multimode wheels to their native mode automatically"); #endif MODULE_DESCRIPTION("HID driver for some logitech \"special\" devices"); MODULE_LICENSE("GPL"); |
456 1269 3116 1982 1168 59 41 68 25 1 64 172 65 65 845 846 81 40 774 846 846 1396 1449 1441 1448 1443 1012 174 846 845 846 594 4 5 4 9 7 19 2 2 2 34 1 828 831 2 2 9 3 1 234 354 57 5 147 24 257 333 693 944 845 83 557 79 833 83 83 83 4 62 62 2903 78 2903 2905 846 845 845 844 846 1 844 509 845 846 5 282 273 273 4 4 4 4 487 1411 4 276 276 2901 2872 212 511 285 433 212 510 212 511 664 671 565 342 1348 866 865 67 1 66 56 6 13 10 312 332 1205 303 19 1374 333 1205 1382 35 6 1383 57 81 518 516 1205 1382 1384 1382 1384 2058 20 8 4 1304 1594 1305 1592 4 21 145 100 2683 2684 2914 2913 2916 2916 261 1 1 3119 3120 6 3119 23 3118 3104 3 4 1249 1555 3117 3116 271 4 271 3107 2729 397 3108 3103 1465 244 3097 2170 3067 47 1194 1229 8 3099 3070 38 3063 232 2696 2698 624 372 538 565 2883 2828 1652 1159 2838 1376 2813 1018 265 1200 1221 987 3 1243 466 8 6 1206 582 583 497 2632 510 2889 15 1341 126 1023 699 2659 20 2827 1312 1961 2111 752 133 1557 1059 273 276 1056 511 118 118 1098 27 27 30 30 147 147 446 913 147 147 735 548 6 556 165 221 49 316 222 36 116 194 97 48 456 30 30 5 178 178 178 27 27 2 363 121 194 187 417 62 46 15 62 2 46 46 31 9 46 25 29 20 7 606 606 556 606 283 66 268 61 160 159 1059 1059 1018 159 1059 12 32 48 1058 1058 160 1059 390 361 50 1144 48 287 1058 1098 913 147 1059 1057 1059 1059 25 1059 1059 1030 1056 42 319 47 237 317 319 1 24 43 257 61 61 319 26 26 146 31 829 77 561 34 1814 19 884 805 6 677 220 155 611 693 692 155 773 60 3 773 60 10 9 446 539 558 332 164 333 10 107 234 108 187 70 108 149 75 234 15 15 15 15 5 2 5 15 14 1 55 2 54 54 67 27 1 92 20 9 62 10 2 8 6 9 1 10 15 1 137 76 42 49 76 40 55 10 130 11 213 213 831 831 15 6 2 119 119 12 12 10 9 71 22 5 1792 4 4 1784 1 83 56 56 44 37 36 44 1 1 12 36 1319 600 744 119 745 2 19 19 1825 41 4 2 1493 841 7 2094 101 46 2092 1267 2 1269 1269 867 116 116 4 1262 1256 7 109 1262 1263 1268 1269 5 11 5 2 6 99 209 595 75 544 247 386 20 3 38 7 30 3 3 23 9 4 7 2 4 2 8 4 1 4 4 842 34 842 842 15 843 18 1 833 2089 689 9 13 2 5 4 8 2 3 2 7 2 1 8 1 1085 253 1 5 599 95 489 11 203 11 213 831 13 1 8 7 17 35 20 3 2 2 2 1 1 10 9 8 4 4 25 1033 595 25 105 1 106 6 4 23 95 1 2 91 52 21 17 38 55 33 15 73 351 1 8 342 255 100 91 255 250 15 17 35 226 237 3 252 376 9 5 3 20 17 6 6 351 4 268 1 2 1 19 244 7 7 7 7 5 5 13 13 9 1 2 4 15 15 15 8 6 7 8 8 8 8 8 6 4 26 3 2 4 838 167 700 162 116 701 701 7 1 34 466 482 30 7 23 1 827 3 50 415 854 24 842 30 842 839 168 701 693 7 34 34 693 40 290 266 89 24 7 3 4 7 269 6 142 137 13 79 4 13 30 5 23 23 2 20 1 8 846 672 97 9 2 94 1 1 5 9 12 3 3 3 13 12 1 17 3 171 8 1 97 1 6 4 8 4 16 10 1 1 1159 1160 1160 1161 36 1161 36 56 56 56 21 34 34 34 172 172 171 172 172 14 172 172 172 171 172 172 172 14 55 117 170 7 39 153 3 150 149 150 43 107 150 150 845 146 704 43 869 868 109 20 123 3 862 850 7 1 7 1 1532 1532 1501 12 23 1 22 7 7 2 5 1283 1286 2 105 1163 32 1163 4 1163 2 1159 1161 1160 1159 975 1161 379 6 7 854 1 856 34 34 820 8 10 23 2 6 23 731 42 50 8 2 1 79 2 2 763 129 798 83 45 796 7 7 7 751 56 755 7 7 1 749 1 1 11 1 855 12 1 319 10 46 46 14 746 747 744 2 12 12 7 7 3 10 7 1 2 1 2 30 2914 4 1 1 1 1 3 49 1 743 693 1 51 1 2 772 764 80 4 20 1 755 1 3 9 4 745 3 3 46 739 46 45 24 22 1 1 33 13 33 4 28 11 7 6 740 1 291 292 291 292 81 73 26 81 75 29 48 10 5 20 48 11 3 24 13 11 2 13 29 19 3 28 5 4 3 5 6 58 44 2 60 2 36 60 70 70 173 362 468 112 469 226 326 292 292 81 81 71 71 23 23 46 46 9 9 2 70 1 4 4 17 405 132 961 487 2 4 28 82 105 10 437 750 115 469 317 225 92 55 53 34 1 1 1 18 17 4 45 1 45 104 891 2 1 2 891 3 1346 1445 2 879 16 5 10 3 99 22 20 4 8 61 1028 326 215 3 4 935 210 11 970 15 1 957 99 4 962 50 13 17 41 316 77 56 7 16 9 99 42 19 2 33 2 49 109 109 56 94 25 109 64 154 19 49 14 33 70 64 9 20 73 152 6 20 18 31 29 14 2 3 5 828 55 71 366 415 365 34 8 17 591 108 76 41 56 53 110 42 13 117 19 21 355 356 49 133 169 51 23 25 47 1 47 1 45 3 59 57 97 7 88 18 96 6 41 2 31 1 7 41 2 39 75 170 75 170 41 123 41 124 650 424 356 1 1 353 65 65 65 65 65 65 65 62 65 65 598 497 113 34 10 5 14 13 13 17 31 2 4 8 8 13 6 24 7 4 10 22 14 27 72 2 1 3 5 7 6 194 194 193 863 1 1 1 15 8 8 4 9 381 315 1 5 720 831 29 465 448 147 474 327 254 73 314 269 44 306 532 4 532 276 55 588 187 547 214 79 2 2 328 412 65 45 50 545 3 492 154 1945 7 9 1928 1 663 205 36 39 39 3 6 3 9 1 1 5 16 1488 1489 1490 1490 8 1477 4 1478 1089 40 96 215 14 59 59 409 2 3 2924 2928 1175 1174 2926 251 12 881 1449 7 1397 234 1962 7 2531 2910 1449 1450 1448 1450 223 45 256 261 1000 17 1011 1011 2926 2927 2929 2883 12 2918 3 2916 1956 2 3119 4 5 31 4 1 2 1 2 6 6 1 1 1 2 3 7 3 3 3 1190 2 7 2 1 11 4 23 2 1 2 6 19 1 1 2 12 11 1 1 1 1 50 3086 7 45 3071 3037 10 31 15 214 97 93 19 19 2 19 1 203 73 193 40 205 55 6 168 170 170 170 272 271 259 272 78 166 198 271 272 1401 217 23 272 272 158 17 154 154 289 35 268 4 260 74 170 5 190 124 18 169 2 21 20 13 12 35 52 76 72 32 85 57 32 280 187 161 33 28 1 48 1 3 48 2 47 2 278 11 175 280 280 291 5 290 290 276 290 262 262 262 262 103 237 262 161 261 262 261 132 119 44 10 262 163 21 40 9 2 190 163 16 152 34 34 23 1399 1400 806 1401 283 46 178 174 8 4 3 168 4 34 33 168 56 149 272 272 262 231 262 210 189 264 263 28 46 46 237 282 1312 35 606 640 65 639 640 567 4 219 3 1480 1482 221 3 2915 2804 2859 1402 202 41 42 2905 1369 5 2914 1332 2912 88 2190 751 2905 11 2912 1585 1589 1549 2 1199 465 3 107 3 70 1 6 3 206 6 27 1334 1334 171 1285 1335 25 1 203 176 176 25 18 1527 1524 1528 1528 11 1488 1411 511 1271 861 39 39 17 1929 14 6 4 5 1 5 743 742 743 744 694 45 3015 3002 3015 1216 5 2 2031 14 7 755 583 2 1 747 39 78 4 566 9 5 3 232 700 24 117 663 38 699 5 700 699 2945 5 290 290 290 290 221 401 401 401 582 582 222 400 582 401 582 6 6 6 1 6 3 110 51 51 50 1 159 159 158 1 1 1 159 159 159 159 28 28 8 1214 1235 1233 1235 52 456 56 1234 1235 1234 56 1235 1235 158 1244 1244 1244 28 10 594 2 11 903 7 373 1257 1257 42 514 26 483 557 216 375 12 1 369 37 119 2 13 504 20 520 1244 1249 1250 1250 1249 620 1243 1249 1249 1250 1250 1250 11 1250 571 862 1250 74 4 3 1 109 613 2 608 38 614 7 7 6 609 609 609 609 1247 1250 1250 1244 6 1249 1249 1263 108 1261 1263 1262 1263 1263 2867 16 2874 2871 593 592 2915 2915 2913 2 2876 212 1768 1147 2876 1268 1267 2 2 1643 2491 630 621 139 15 13 2 1 1 12 5 3041 105 3 17 2928 15 14 3133 3138 3137 3131 3061 80 2 79 3064 3133 50 3122 47 30 26 165 2918 11 16 1643 9 9 1269 1263 1262 1863 1235 1235 1235 51 28 1263 1250 9 1244 1250 3091 3090 1978 925 290 1191 1192 1192 290 2861 3089 3090 3021 80 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368 13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429 13430 13431 13432 13433 13434 13435 13436 13437 13438 13439 13440 13441 13442 13443 13444 13445 13446 13447 13448 13449 13450 13451 13452 13453 13454 13455 13456 13457 13458 13459 13460 13461 13462 13463 13464 13465 13466 13467 13468 13469 13470 13471 13472 13473 13474 13475 13476 13477 13478 13479 13480 13481 13482 13483 13484 13485 13486 13487 13488 13489 13490 13491 13492 13493 13494 13495 13496 13497 13498 13499 13500 13501 13502 13503 13504 13505 13506 13507 13508 13509 13510 13511 13512 13513 13514 13515 13516 13517 13518 13519 13520 13521 13522 13523 13524 13525 13526 13527 13528 13529 13530 13531 13532 13533 13534 13535 13536 13537 13538 13539 13540 13541 13542 13543 13544 13545 13546 13547 13548 13549 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559 13560 13561 13562 13563 13564 13565 13566 13567 13568 13569 13570 13571 13572 13573 13574 13575 13576 13577 13578 13579 13580 13581 13582 13583 13584 13585 13586 13587 13588 13589 13590 13591 13592 13593 13594 13595 13596 13597 13598 13599 13600 13601 13602 13603 13604 13605 13606 13607 13608 13609 13610 13611 13612 13613 13614 13615 13616 13617 13618 13619 13620 13621 13622 13623 13624 13625 13626 13627 13628 13629 13630 13631 13632 13633 13634 13635 13636 13637 13638 13639 13640 13641 13642 13643 13644 13645 13646 13647 13648 13649 13650 13651 13652 13653 13654 13655 13656 13657 13658 13659 13660 13661 13662 13663 13664 13665 13666 13667 13668 13669 13670 13671 13672 13673 13674 13675 13676 13677 13678 13679 13680 13681 13682 13683 13684 13685 13686 13687 13688 13689 13690 13691 13692 13693 13694 13695 13696 13697 13698 13699 13700 13701 13702 13703 13704 13705 13706 13707 13708 13709 13710 13711 13712 13713 13714 13715 13716 13717 13718 13719 13720 13721 13722 13723 13724 13725 13726 13727 13728 13729 13730 13731 13732 13733 13734 13735 13736 13737 13738 13739 13740 13741 13742 13743 13744 13745 13746 13747 13748 13749 13750 13751 13752 13753 13754 13755 13756 13757 13758 13759 13760 13761 13762 13763 13764 13765 13766 13767 13768 13769 13770 13771 13772 13773 13774 13775 13776 13777 13778 13779 13780 13781 13782 13783 13784 13785 13786 13787 13788 13789 13790 13791 13792 13793 13794 13795 13796 13797 13798 13799 13800 13801 13802 13803 13804 13805 13806 13807 13808 13809 13810 13811 13812 13813 13814 13815 13816 13817 13818 13819 13820 13821 13822 13823 13824 13825 13826 13827 13828 13829 13830 13831 13832 13833 13834 13835 13836 13837 13838 13839 13840 13841 13842 13843 13844 13845 13846 13847 13848 13849 13850 13851 13852 13853 13854 13855 13856 13857 13858 13859 13860 13861 13862 13863 13864 13865 13866 13867 13868 13869 13870 13871 13872 13873 13874 13875 13876 13877 13878 13879 13880 13881 13882 13883 13884 13885 13886 13887 13888 13889 13890 13891 13892 13893 13894 13895 13896 13897 13898 13899 13900 13901 13902 13903 13904 13905 13906 13907 13908 13909 13910 13911 13912 13913 13914 13915 13916 13917 13918 13919 13920 13921 13922 13923 13924 13925 13926 13927 13928 13929 13930 13931 13932 13933 13934 13935 13936 13937 13938 13939 13940 13941 13942 13943 13944 13945 13946 13947 13948 13949 13950 13951 13952 13953 13954 13955 13956 13957 13958 13959 13960 13961 13962 13963 13964 13965 13966 13967 13968 13969 13970 13971 13972 13973 13974 13975 13976 13977 13978 13979 13980 13981 13982 13983 13984 13985 13986 13987 13988 13989 13990 13991 13992 13993 13994 13995 13996 13997 13998 13999 14000 14001 14002 14003 14004 14005 14006 14007 14008 14009 14010 14011 14012 14013 14014 14015 14016 14017 14018 14019 14020 14021 14022 14023 14024 14025 14026 14027 14028 14029 14030 14031 14032 14033 14034 14035 14036 14037 14038 14039 14040 14041 14042 14043 14044 14045 14046 14047 14048 14049 14050 14051 14052 14053 14054 14055 14056 14057 14058 14059 14060 14061 14062 14063 14064 14065 14066 14067 14068 14069 14070 14071 14072 14073 14074 14075 14076 14077 14078 14079 14080 14081 14082 14083 14084 14085 14086 14087 14088 14089 14090 14091 14092 14093 14094 14095 14096 14097 14098 14099 14100 14101 14102 14103 14104 14105 14106 14107 14108 14109 14110 14111 14112 14113 14114 14115 14116 14117 14118 14119 14120 14121 14122 14123 14124 14125 14126 14127 14128 14129 14130 14131 14132 14133 14134 14135 14136 14137 14138 14139 14140 14141 14142 14143 14144 14145 14146 14147 14148 14149 14150 14151 14152 14153 14154 14155 14156 14157 14158 14159 14160 14161 14162 14163 14164 14165 14166 14167 14168 14169 14170 14171 14172 14173 14174 14175 14176 14177 14178 14179 14180 14181 14182 14183 14184 14185 14186 14187 14188 14189 14190 14191 14192 14193 14194 14195 14196 14197 14198 14199 14200 14201 14202 14203 14204 14205 14206 14207 14208 14209 14210 14211 14212 14213 14214 14215 14216 14217 14218 14219 14220 14221 14222 14223 14224 14225 14226 14227 14228 14229 14230 14231 14232 14233 14234 14235 14236 14237 14238 14239 14240 14241 14242 14243 14244 14245 14246 14247 14248 14249 14250 14251 14252 14253 14254 14255 14256 14257 14258 14259 14260 14261 14262 14263 14264 14265 14266 14267 14268 14269 14270 14271 14272 14273 14274 14275 14276 14277 14278 14279 14280 14281 14282 14283 14284 14285 14286 14287 14288 14289 14290 14291 14292 14293 14294 14295 14296 14297 14298 14299 14300 14301 14302 14303 14304 14305 14306 14307 14308 14309 14310 14311 14312 14313 14314 14315 14316 14317 14318 14319 14320 14321 14322 14323 14324 14325 14326 14327 14328 14329 14330 14331 14332 14333 14334 14335 14336 14337 14338 14339 14340 14341 14342 14343 14344 14345 14346 14347 14348 14349 14350 14351 14352 14353 14354 14355 14356 14357 14358 14359 14360 14361 14362 14363 14364 14365 14366 14367 14368 14369 14370 14371 14372 14373 14374 14375 14376 14377 14378 14379 14380 14381 14382 14383 14384 14385 14386 14387 14388 14389 14390 14391 14392 14393 14394 14395 14396 14397 14398 14399 14400 14401 14402 14403 14404 14405 14406 14407 14408 14409 14410 14411 14412 14413 14414 14415 14416 14417 14418 14419 14420 14421 14422 14423 14424 14425 14426 14427 14428 14429 14430 14431 14432 14433 14434 14435 14436 14437 14438 14439 14440 14441 14442 14443 14444 14445 14446 14447 14448 14449 14450 14451 14452 14453 14454 14455 14456 14457 14458 14459 14460 14461 14462 14463 14464 14465 14466 14467 14468 14469 14470 14471 14472 14473 14474 14475 14476 14477 14478 14479 14480 14481 14482 14483 14484 14485 14486 14487 14488 14489 14490 14491 14492 14493 14494 14495 14496 14497 14498 14499 14500 14501 14502 14503 14504 14505 14506 14507 14508 14509 14510 14511 14512 14513 14514 14515 14516 14517 14518 14519 14520 14521 14522 14523 14524 14525 14526 14527 14528 14529 14530 14531 14532 14533 14534 14535 14536 14537 14538 14539 14540 14541 14542 14543 14544 14545 14546 14547 14548 14549 14550 14551 14552 14553 14554 14555 14556 14557 14558 14559 14560 14561 14562 14563 14564 14565 14566 14567 14568 14569 14570 14571 14572 14573 14574 14575 14576 14577 14578 14579 14580 14581 14582 14583 14584 14585 14586 14587 14588 14589 14590 14591 14592 14593 14594 14595 14596 14597 14598 14599 14600 14601 14602 14603 14604 14605 14606 14607 14608 14609 14610 14611 14612 14613 14614 14615 14616 14617 14618 14619 14620 14621 14622 14623 14624 14625 14626 14627 14628 14629 14630 14631 14632 14633 14634 14635 14636 14637 14638 14639 14640 14641 14642 14643 14644 14645 14646 14647 14648 14649 14650 14651 14652 14653 14654 14655 14656 14657 14658 14659 14660 14661 14662 14663 14664 14665 14666 14667 14668 14669 14670 14671 14672 14673 14674 14675 14676 14677 14678 14679 14680 14681 14682 14683 14684 14685 14686 14687 14688 14689 14690 14691 14692 14693 14694 14695 14696 14697 14698 14699 14700 14701 14702 14703 14704 14705 14706 14707 14708 14709 14710 14711 14712 14713 14714 14715 14716 14717 14718 14719 14720 14721 14722 14723 14724 14725 14726 14727 14728 14729 14730 14731 14732 14733 14734 14735 14736 14737 14738 14739 14740 14741 14742 14743 14744 14745 14746 14747 14748 14749 14750 14751 14752 14753 14754 14755 14756 14757 14758 14759 14760 14761 14762 14763 14764 14765 14766 14767 14768 14769 14770 14771 14772 14773 14774 14775 14776 14777 14778 14779 14780 14781 14782 14783 14784 14785 14786 14787 14788 14789 14790 14791 14792 14793 14794 14795 14796 14797 14798 14799 14800 14801 14802 14803 14804 14805 14806 14807 14808 14809 14810 14811 14812 14813 14814 14815 14816 14817 14818 14819 14820 14821 14822 14823 14824 14825 14826 14827 14828 14829 14830 14831 14832 14833 14834 14835 14836 14837 14838 14839 14840 14841 14842 14843 14844 14845 14846 14847 14848 14849 14850 14851 14852 14853 14854 14855 14856 14857 14858 14859 14860 14861 14862 14863 14864 14865 14866 14867 14868 14869 14870 14871 14872 14873 14874 14875 14876 14877 14878 14879 14880 14881 14882 14883 14884 14885 14886 14887 14888 14889 14890 14891 14892 14893 14894 14895 14896 14897 14898 14899 14900 14901 14902 14903 14904 14905 14906 14907 14908 14909 14910 14911 14912 14913 14914 14915 14916 14917 14918 14919 14920 14921 14922 14923 14924 14925 14926 14927 14928 14929 14930 14931 14932 14933 14934 14935 14936 14937 14938 14939 14940 14941 14942 14943 14944 14945 14946 14947 14948 14949 14950 14951 14952 14953 14954 14955 14956 14957 14958 14959 14960 14961 14962 14963 14964 14965 14966 14967 14968 14969 14970 14971 14972 14973 14974 14975 14976 14977 14978 14979 14980 14981 14982 14983 14984 14985 14986 14987 14988 14989 14990 14991 14992 14993 14994 14995 14996 14997 14998 14999 15000 15001 15002 15003 15004 15005 15006 15007 15008 15009 15010 15011 15012 15013 15014 15015 15016 15017 15018 15019 15020 15021 15022 15023 15024 15025 15026 15027 15028 15029 15030 15031 15032 15033 15034 15035 15036 15037 15038 15039 15040 15041 15042 15043 15044 15045 15046 15047 15048 15049 15050 15051 15052 15053 15054 15055 15056 15057 15058 15059 15060 15061 15062 15063 15064 15065 15066 15067 15068 15069 15070 15071 15072 15073 15074 15075 15076 15077 15078 15079 15080 15081 15082 15083 15084 15085 15086 15087 15088 15089 15090 15091 15092 15093 15094 15095 15096 15097 15098 15099 15100 15101 15102 15103 15104 15105 15106 15107 15108 15109 15110 15111 15112 15113 15114 15115 15116 15117 15118 15119 15120 15121 15122 15123 15124 15125 15126 15127 15128 15129 15130 15131 15132 15133 15134 15135 15136 15137 15138 15139 15140 15141 15142 15143 15144 15145 15146 15147 15148 15149 15150 15151 15152 15153 15154 15155 15156 15157 15158 15159 15160 15161 15162 15163 15164 15165 15166 15167 15168 15169 15170 15171 15172 15173 15174 15175 15176 15177 15178 15179 15180 15181 15182 15183 15184 15185 15186 15187 15188 15189 15190 15191 15192 15193 15194 15195 15196 15197 15198 15199 15200 15201 15202 15203 15204 15205 15206 15207 15208 15209 15210 15211 15212 15213 15214 15215 15216 15217 15218 15219 15220 15221 15222 15223 15224 15225 15226 15227 15228 15229 15230 15231 15232 15233 15234 15235 15236 15237 15238 15239 15240 15241 15242 15243 15244 15245 15246 15247 15248 15249 15250 15251 15252 15253 15254 15255 15256 15257 15258 15259 15260 15261 15262 15263 15264 15265 15266 15267 15268 15269 15270 15271 15272 15273 15274 15275 15276 15277 15278 15279 15280 15281 15282 15283 15284 15285 15286 15287 15288 15289 15290 15291 15292 15293 15294 15295 15296 15297 15298 15299 15300 15301 15302 15303 15304 15305 15306 15307 15308 15309 15310 15311 15312 15313 15314 15315 15316 15317 15318 15319 15320 15321 15322 15323 15324 15325 15326 15327 15328 15329 15330 15331 15332 15333 15334 15335 15336 15337 15338 15339 15340 15341 15342 15343 15344 15345 15346 15347 15348 15349 15350 15351 15352 15353 15354 15355 15356 15357 15358 15359 15360 15361 15362 15363 15364 15365 15366 15367 15368 15369 15370 15371 15372 15373 15374 15375 15376 15377 15378 15379 15380 15381 15382 15383 15384 15385 15386 15387 15388 15389 15390 15391 15392 15393 15394 15395 15396 15397 15398 15399 15400 15401 15402 15403 15404 15405 15406 15407 15408 15409 15410 15411 15412 15413 15414 15415 15416 15417 15418 15419 15420 15421 15422 15423 15424 15425 15426 15427 15428 15429 15430 15431 15432 15433 15434 15435 15436 15437 15438 15439 15440 15441 15442 15443 15444 15445 15446 15447 15448 15449 15450 15451 15452 15453 15454 15455 15456 15457 15458 15459 15460 15461 15462 15463 15464 15465 15466 15467 15468 15469 15470 15471 15472 15473 15474 15475 15476 15477 15478 15479 15480 15481 15482 15483 15484 15485 15486 15487 15488 15489 15490 15491 15492 15493 15494 15495 15496 15497 15498 15499 15500 15501 15502 15503 15504 15505 15506 15507 15508 15509 15510 15511 15512 15513 15514 15515 15516 15517 15518 15519 15520 15521 15522 15523 15524 15525 15526 15527 15528 15529 15530 15531 15532 15533 15534 15535 15536 15537 15538 15539 15540 15541 15542 15543 15544 15545 15546 15547 15548 15549 15550 15551 15552 15553 15554 15555 15556 15557 15558 15559 15560 15561 15562 15563 15564 15565 15566 15567 15568 15569 15570 15571 15572 15573 15574 15575 15576 15577 15578 15579 15580 15581 15582 15583 15584 15585 15586 15587 15588 15589 15590 15591 15592 15593 15594 15595 15596 15597 15598 15599 15600 15601 15602 15603 15604 15605 15606 15607 15608 15609 15610 15611 15612 15613 15614 15615 15616 15617 15618 15619 15620 15621 15622 15623 15624 15625 15626 15627 15628 15629 15630 15631 15632 15633 15634 15635 15636 15637 15638 15639 15640 15641 15642 15643 15644 15645 15646 15647 15648 15649 15650 15651 15652 15653 15654 15655 15656 15657 15658 15659 15660 15661 15662 15663 15664 15665 15666 15667 15668 15669 15670 15671 15672 15673 15674 15675 15676 15677 15678 15679 15680 15681 15682 15683 15684 15685 15686 15687 15688 15689 15690 15691 15692 15693 15694 15695 15696 15697 15698 15699 15700 15701 15702 15703 15704 15705 15706 15707 15708 15709 15710 15711 15712 15713 15714 15715 15716 15717 15718 15719 15720 15721 15722 15723 15724 15725 15726 15727 15728 15729 15730 15731 15732 15733 15734 15735 15736 15737 15738 15739 15740 15741 15742 15743 15744 15745 15746 15747 15748 15749 15750 15751 15752 15753 15754 15755 15756 15757 15758 15759 15760 15761 15762 15763 15764 15765 15766 15767 15768 15769 15770 15771 15772 15773 15774 15775 15776 15777 15778 15779 15780 15781 15782 15783 15784 15785 15786 15787 15788 15789 15790 15791 15792 15793 15794 15795 15796 15797 15798 15799 15800 15801 15802 15803 15804 15805 15806 15807 15808 15809 15810 15811 15812 15813 15814 15815 15816 15817 15818 15819 15820 15821 15822 15823 15824 15825 15826 15827 15828 15829 15830 15831 15832 15833 15834 15835 15836 15837 15838 15839 15840 15841 15842 15843 15844 15845 15846 15847 15848 15849 15850 15851 15852 15853 15854 15855 15856 15857 15858 15859 15860 15861 15862 15863 15864 15865 15866 15867 15868 15869 15870 15871 15872 15873 15874 15875 15876 15877 15878 15879 15880 15881 15882 15883 15884 15885 15886 15887 15888 15889 15890 15891 15892 15893 15894 15895 15896 15897 15898 15899 15900 15901 15902 15903 15904 15905 15906 15907 15908 15909 15910 15911 15912 15913 15914 15915 15916 15917 15918 15919 15920 15921 15922 15923 15924 15925 15926 15927 15928 15929 15930 15931 15932 15933 15934 15935 15936 15937 15938 15939 15940 15941 15942 15943 15944 15945 15946 15947 15948 15949 15950 15951 15952 15953 15954 15955 15956 15957 15958 15959 15960 15961 15962 15963 15964 15965 15966 15967 15968 15969 15970 15971 15972 15973 15974 15975 15976 15977 15978 15979 15980 15981 15982 15983 15984 15985 15986 15987 15988 15989 15990 15991 15992 15993 15994 15995 15996 15997 15998 15999 16000 16001 16002 16003 16004 16005 16006 16007 16008 16009 16010 16011 16012 16013 16014 16015 16016 16017 16018 16019 16020 16021 16022 16023 16024 16025 16026 16027 16028 16029 16030 16031 16032 16033 16034 16035 16036 16037 16038 16039 16040 16041 16042 16043 16044 16045 16046 16047 16048 16049 16050 16051 16052 16053 16054 16055 16056 16057 16058 16059 16060 16061 16062 16063 16064 16065 16066 16067 16068 16069 16070 16071 16072 16073 16074 16075 16076 16077 16078 16079 16080 16081 16082 16083 16084 16085 16086 16087 16088 16089 16090 16091 16092 16093 16094 16095 16096 16097 16098 16099 16100 16101 16102 16103 16104 16105 16106 16107 16108 16109 16110 16111 16112 16113 16114 16115 16116 16117 16118 16119 16120 16121 16122 16123 16124 16125 16126 16127 16128 16129 16130 16131 16132 16133 16134 16135 16136 16137 16138 16139 16140 16141 16142 16143 16144 16145 16146 16147 16148 16149 16150 16151 16152 16153 16154 16155 16156 16157 16158 16159 16160 16161 16162 16163 16164 16165 16166 16167 16168 16169 16170 16171 16172 16173 16174 16175 16176 16177 16178 16179 16180 16181 16182 16183 16184 16185 16186 16187 16188 16189 16190 16191 16192 16193 16194 16195 16196 16197 16198 16199 16200 16201 16202 16203 16204 16205 16206 16207 16208 16209 16210 16211 16212 16213 16214 16215 16216 16217 16218 16219 16220 16221 16222 16223 16224 16225 16226 16227 16228 16229 16230 16231 16232 16233 16234 16235 16236 16237 16238 16239 16240 16241 16242 16243 16244 16245 16246 16247 16248 16249 16250 16251 16252 16253 16254 16255 16256 16257 16258 16259 16260 16261 16262 16263 16264 16265 16266 16267 16268 16269 16270 16271 16272 16273 16274 16275 16276 16277 16278 16279 16280 16281 16282 16283 16284 16285 16286 16287 16288 16289 16290 16291 16292 16293 16294 16295 16296 16297 16298 16299 16300 16301 16302 16303 16304 16305 16306 16307 16308 16309 16310 16311 16312 16313 16314 16315 16316 16317 16318 16319 16320 16321 16322 16323 16324 16325 16326 16327 16328 16329 16330 16331 16332 16333 16334 16335 16336 16337 16338 16339 16340 16341 16342 16343 16344 16345 16346 16347 16348 16349 16350 16351 16352 16353 16354 16355 16356 16357 16358 16359 16360 16361 16362 16363 16364 16365 16366 16367 16368 16369 16370 16371 16372 16373 16374 16375 16376 16377 16378 16379 16380 16381 16382 16383 16384 16385 16386 16387 16388 16389 16390 16391 16392 16393 16394 16395 16396 16397 16398 16399 16400 16401 16402 16403 16404 16405 16406 16407 16408 16409 16410 16411 16412 16413 16414 16415 16416 16417 16418 16419 16420 16421 16422 16423 16424 16425 16426 16427 16428 16429 16430 16431 16432 16433 16434 16435 16436 16437 16438 16439 16440 16441 16442 16443 16444 16445 16446 16447 16448 16449 16450 16451 16452 16453 16454 16455 16456 16457 16458 16459 16460 16461 16462 16463 16464 16465 16466 16467 16468 16469 16470 16471 16472 16473 16474 16475 16476 16477 16478 16479 16480 16481 16482 16483 16484 16485 16486 16487 16488 16489 16490 16491 16492 16493 16494 16495 16496 16497 16498 16499 16500 16501 16502 16503 16504 16505 16506 16507 16508 16509 16510 16511 16512 16513 16514 16515 16516 16517 16518 16519 16520 16521 16522 16523 16524 16525 16526 16527 16528 16529 16530 16531 16532 16533 16534 16535 16536 16537 16538 16539 16540 16541 16542 16543 16544 16545 16546 16547 16548 16549 16550 16551 16552 16553 16554 16555 16556 16557 16558 16559 16560 16561 16562 16563 16564 16565 16566 16567 16568 16569 16570 16571 16572 16573 16574 16575 16576 16577 16578 16579 16580 16581 16582 16583 16584 16585 16586 16587 16588 16589 16590 16591 16592 16593 16594 16595 16596 16597 16598 16599 16600 16601 16602 16603 16604 16605 16606 16607 16608 16609 16610 16611 16612 16613 16614 16615 16616 16617 16618 16619 16620 16621 16622 16623 16624 16625 16626 16627 16628 16629 16630 16631 16632 16633 16634 16635 16636 16637 16638 16639 16640 16641 16642 16643 16644 16645 16646 16647 16648 16649 16650 16651 16652 16653 16654 16655 16656 16657 16658 16659 16660 16661 16662 16663 16664 16665 16666 16667 16668 16669 16670 16671 16672 16673 16674 16675 16676 16677 16678 16679 16680 16681 16682 16683 16684 16685 16686 16687 16688 16689 16690 16691 16692 16693 16694 16695 16696 16697 16698 16699 16700 16701 16702 16703 16704 16705 16706 16707 16708 16709 16710 16711 16712 16713 16714 16715 16716 16717 16718 16719 16720 16721 16722 16723 16724 16725 16726 16727 16728 16729 16730 16731 16732 16733 16734 16735 16736 16737 16738 16739 16740 16741 16742 16743 16744 16745 16746 16747 16748 16749 16750 16751 16752 16753 16754 16755 16756 16757 16758 16759 16760 16761 16762 16763 16764 16765 16766 16767 16768 16769 16770 16771 16772 16773 16774 16775 16776 16777 16778 16779 16780 16781 16782 16783 16784 16785 16786 16787 16788 16789 16790 16791 16792 16793 16794 16795 16796 16797 16798 16799 16800 16801 16802 16803 16804 16805 16806 16807 16808 16809 16810 16811 16812 16813 16814 16815 16816 16817 16818 16819 16820 16821 16822 16823 16824 16825 16826 16827 16828 16829 16830 16831 16832 16833 16834 16835 16836 16837 16838 16839 16840 16841 16842 16843 16844 16845 16846 16847 16848 16849 16850 16851 16852 16853 16854 16855 16856 16857 16858 16859 16860 16861 16862 16863 16864 16865 16866 16867 16868 16869 16870 16871 16872 16873 16874 16875 16876 16877 16878 16879 16880 16881 16882 16883 16884 16885 16886 16887 16888 16889 16890 16891 16892 16893 16894 16895 16896 16897 16898 16899 16900 16901 16902 16903 16904 16905 16906 16907 16908 16909 16910 16911 16912 16913 16914 16915 16916 16917 16918 16919 16920 16921 16922 16923 16924 16925 16926 16927 16928 16929 16930 16931 16932 16933 16934 16935 16936 16937 16938 16939 16940 16941 16942 16943 16944 16945 16946 16947 16948 16949 16950 16951 16952 16953 16954 16955 16956 16957 16958 16959 16960 16961 16962 16963 16964 16965 16966 16967 16968 16969 16970 16971 16972 16973 16974 16975 16976 16977 16978 16979 16980 16981 16982 16983 16984 16985 16986 16987 16988 16989 16990 16991 16992 16993 16994 16995 16996 16997 16998 16999 17000 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 17011 17012 17013 17014 17015 17016 17017 17018 17019 17020 17021 17022 17023 17024 17025 17026 17027 17028 17029 17030 17031 17032 17033 17034 17035 17036 17037 17038 17039 17040 17041 17042 17043 17044 17045 17046 17047 17048 17049 17050 17051 17052 17053 17054 17055 17056 17057 17058 17059 17060 17061 17062 17063 17064 17065 17066 17067 17068 17069 17070 17071 17072 17073 17074 17075 17076 17077 17078 17079 17080 17081 17082 17083 17084 17085 17086 17087 17088 17089 17090 17091 17092 17093 17094 17095 17096 17097 17098 17099 17100 17101 17102 17103 17104 17105 17106 17107 17108 17109 17110 17111 17112 17113 17114 17115 17116 17117 17118 17119 17120 17121 17122 17123 17124 17125 17126 17127 17128 17129 17130 17131 17132 17133 17134 17135 17136 17137 17138 17139 17140 17141 17142 17143 17144 17145 17146 17147 17148 17149 17150 17151 17152 17153 17154 17155 17156 17157 17158 17159 17160 17161 17162 17163 17164 17165 17166 17167 17168 17169 17170 17171 17172 17173 17174 17175 17176 17177 17178 17179 17180 17181 17182 17183 17184 17185 17186 17187 17188 17189 17190 17191 17192 17193 17194 17195 17196 17197 17198 17199 17200 17201 17202 17203 17204 17205 17206 17207 17208 17209 17210 17211 17212 17213 17214 17215 17216 17217 17218 17219 17220 17221 17222 17223 17224 17225 17226 17227 17228 17229 17230 17231 17232 17233 17234 17235 17236 17237 17238 17239 17240 17241 17242 17243 17244 17245 17246 17247 17248 17249 17250 17251 17252 17253 17254 17255 17256 17257 17258 17259 17260 17261 17262 17263 17264 17265 17266 17267 17268 17269 17270 17271 17272 17273 17274 17275 17276 17277 17278 17279 17280 17281 17282 17283 17284 17285 17286 17287 17288 17289 17290 17291 17292 17293 17294 17295 17296 17297 17298 17299 17300 17301 17302 17303 17304 17305 17306 17307 17308 17309 17310 17311 17312 17313 17314 17315 17316 17317 17318 17319 17320 17321 17322 17323 17324 17325 17326 17327 17328 17329 17330 17331 17332 17333 17334 17335 17336 17337 17338 17339 17340 17341 17342 17343 17344 17345 17346 17347 17348 17349 17350 17351 17352 17353 17354 17355 17356 17357 17358 17359 17360 17361 17362 17363 17364 17365 17366 17367 17368 17369 17370 17371 17372 17373 17374 17375 17376 17377 17378 17379 17380 17381 17382 17383 17384 17385 17386 17387 17388 17389 17390 17391 17392 17393 17394 17395 17396 17397 17398 17399 17400 17401 17402 17403 17404 17405 17406 17407 17408 17409 17410 17411 17412 17413 17414 17415 17416 17417 17418 17419 17420 17421 17422 17423 17424 17425 17426 17427 17428 17429 17430 17431 17432 17433 17434 17435 17436 17437 17438 17439 17440 17441 17442 17443 17444 17445 17446 17447 17448 17449 17450 17451 17452 17453 17454 17455 17456 17457 17458 17459 17460 17461 17462 17463 17464 17465 17466 17467 17468 17469 17470 17471 17472 17473 17474 17475 17476 17477 17478 17479 17480 17481 17482 17483 17484 17485 17486 17487 17488 17489 17490 17491 17492 17493 17494 17495 17496 17497 17498 17499 17500 17501 17502 17503 17504 17505 17506 17507 17508 17509 17510 17511 17512 17513 17514 17515 17516 17517 17518 17519 17520 17521 17522 17523 17524 17525 17526 17527 17528 17529 17530 17531 17532 17533 17534 17535 17536 17537 17538 17539 17540 17541 17542 17543 17544 17545 17546 17547 17548 17549 17550 17551 17552 17553 17554 17555 17556 17557 17558 17559 17560 17561 17562 17563 17564 17565 17566 17567 17568 17569 17570 17571 17572 17573 17574 17575 17576 17577 17578 17579 17580 17581 17582 17583 17584 17585 17586 17587 17588 17589 17590 17591 17592 17593 17594 17595 17596 17597 17598 17599 17600 17601 17602 17603 17604 17605 17606 17607 17608 17609 17610 17611 17612 17613 17614 17615 17616 17617 17618 17619 17620 17621 17622 17623 17624 17625 17626 17627 17628 17629 17630 17631 17632 17633 17634 17635 17636 17637 17638 17639 17640 17641 17642 17643 17644 17645 17646 17647 17648 17649 17650 17651 17652 17653 17654 17655 17656 17657 17658 17659 17660 17661 17662 17663 17664 17665 17666 17667 17668 17669 17670 17671 17672 17673 17674 17675 17676 17677 17678 17679 17680 17681 17682 17683 17684 17685 17686 17687 17688 17689 17690 17691 17692 17693 17694 17695 17696 17697 17698 17699 17700 17701 17702 17703 17704 17705 17706 17707 17708 17709 17710 17711 17712 17713 17714 17715 17716 17717 17718 17719 17720 17721 17722 17723 17724 17725 17726 17727 17728 17729 17730 17731 17732 17733 17734 17735 17736 17737 17738 17739 17740 17741 17742 17743 17744 17745 17746 17747 17748 17749 17750 17751 17752 17753 17754 17755 17756 17757 17758 17759 17760 17761 17762 17763 17764 17765 17766 17767 17768 17769 17770 17771 17772 17773 17774 17775 17776 17777 17778 17779 17780 17781 17782 17783 17784 17785 17786 17787 17788 17789 17790 17791 17792 17793 17794 17795 17796 17797 17798 17799 17800 17801 17802 17803 17804 17805 17806 17807 17808 17809 17810 17811 17812 17813 17814 17815 17816 17817 17818 17819 17820 17821 17822 17823 17824 17825 17826 17827 17828 17829 17830 17831 17832 17833 17834 17835 17836 17837 17838 17839 17840 17841 17842 17843 17844 17845 17846 17847 17848 17849 17850 17851 17852 17853 17854 17855 17856 17857 17858 17859 17860 17861 17862 17863 17864 17865 17866 17867 17868 17869 17870 17871 17872 17873 17874 17875 17876 17877 17878 17879 17880 17881 17882 17883 17884 17885 17886 17887 17888 17889 17890 17891 17892 17893 17894 17895 17896 17897 17898 17899 17900 17901 17902 17903 17904 17905 17906 17907 17908 17909 17910 17911 17912 17913 17914 17915 17916 17917 17918 17919 17920 17921 17922 17923 17924 17925 17926 17927 17928 17929 17930 17931 17932 17933 17934 17935 17936 17937 17938 17939 17940 17941 17942 17943 17944 17945 17946 17947 17948 17949 17950 17951 17952 17953 17954 17955 17956 17957 17958 17959 17960 17961 17962 17963 17964 17965 17966 17967 17968 17969 17970 17971 17972 17973 17974 17975 17976 17977 17978 17979 17980 17981 17982 17983 17984 17985 17986 17987 17988 17989 17990 17991 17992 17993 17994 17995 17996 17997 17998 17999 18000 18001 18002 18003 18004 18005 18006 18007 18008 18009 18010 18011 18012 18013 18014 18015 18016 18017 18018 18019 18020 18021 18022 18023 18024 18025 18026 18027 18028 18029 18030 18031 18032 18033 18034 18035 18036 18037 18038 18039 18040 18041 18042 18043 18044 18045 18046 18047 18048 18049 18050 18051 18052 18053 18054 18055 18056 18057 18058 18059 18060 18061 18062 18063 18064 18065 18066 18067 18068 18069 18070 18071 18072 18073 18074 18075 18076 18077 18078 18079 18080 18081 18082 18083 18084 18085 18086 18087 18088 18089 18090 18091 18092 18093 18094 18095 18096 18097 18098 18099 18100 18101 18102 18103 18104 18105 18106 18107 18108 18109 18110 18111 18112 18113 18114 18115 18116 18117 18118 18119 18120 18121 18122 18123 18124 18125 18126 18127 18128 18129 18130 18131 18132 18133 18134 18135 18136 18137 18138 18139 18140 18141 18142 18143 18144 18145 18146 18147 18148 18149 18150 18151 18152 18153 18154 18155 18156 18157 18158 18159 18160 18161 18162 18163 18164 18165 18166 18167 18168 18169 18170 18171 18172 18173 18174 18175 18176 18177 18178 18179 18180 18181 18182 18183 18184 18185 18186 18187 18188 18189 18190 18191 18192 18193 18194 18195 18196 18197 18198 18199 18200 18201 18202 18203 18204 18205 18206 18207 18208 18209 18210 18211 18212 18213 18214 18215 18216 18217 18218 18219 18220 18221 18222 18223 18224 18225 18226 18227 18228 18229 18230 18231 18232 18233 18234 18235 18236 18237 18238 18239 18240 18241 18242 18243 18244 18245 18246 18247 18248 18249 18250 18251 18252 18253 18254 18255 18256 18257 18258 18259 18260 18261 18262 18263 18264 18265 18266 18267 18268 18269 18270 18271 18272 18273 18274 18275 18276 18277 18278 18279 18280 18281 18282 18283 18284 18285 18286 18287 18288 18289 18290 18291 18292 18293 18294 |